pyconll 3.1.0.dev3__tar.gz → 3.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/MANIFEST.in +0 -2
- pyconll-3.3.0/PKG-INFO +138 -0
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/README.md +1 -1
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll/__init__.py +2 -2
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll/_parser.py +18 -8
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll/_version.py +1 -1
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll/conllable.py +1 -0
- pyconll-3.3.0/pyconll/load.py +132 -0
- pyconll-3.3.0/pyconll/py.typed +0 -0
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll/tree/_treebuilder.py +5 -4
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll/tree/tree.py +2 -2
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll/unit/conll.py +9 -5
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll/unit/sentence.py +31 -17
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll/unit/token.py +40 -33
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll/util.py +5 -4
- pyconll-3.3.0/pyconll.egg-info/PKG-INFO +138 -0
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll.egg-info/SOURCES.txt +5 -3
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/setup.py +4 -3
- pyconll-3.3.0/tests/test_conllable.py +27 -0
- pyconll-3.3.0/tests/test_load.py +110 -0
- pyconll-3.3.0/tests/test_util.py +213 -0
- pyconll-3.1.0.dev3/PKG-INFO +0 -192
- pyconll-3.1.0.dev3/README +0 -140
- pyconll-3.1.0.dev3/README.rst +0 -171
- pyconll-3.1.0.dev3/pyconll/load.py +0 -91
- pyconll-3.1.0.dev3/pyconll.egg-info/PKG-INFO +0 -192
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/LICENSE +0 -0
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll/exception.py +0 -0
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll/tree/__init__.py +0 -0
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll/unit/__init__.py +0 -0
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll.egg-info/dependency_links.txt +0 -0
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/pyconll.egg-info/top_level.txt +0 -0
- {pyconll-3.1.0.dev3 → pyconll-3.3.0}/setup.cfg +0 -0
pyconll-3.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyconll
|
|
3
|
+
Version: 3.3.0
|
|
4
|
+
Summary: Read and manipulate CoNLL files
|
|
5
|
+
Home-page: https://github.com/pyconll/pyconll
|
|
6
|
+
Author: Matias Grioni
|
|
7
|
+
Author-email: matgrioni@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Keywords: nlp,conllu,conll,universal dependencies
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Education
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering
|
|
17
|
+
Classifier: Topic :: Utilities
|
|
18
|
+
Requires-Python: ~=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Dynamic: author
|
|
22
|
+
Dynamic: author-email
|
|
23
|
+
Dynamic: classifier
|
|
24
|
+
Dynamic: description
|
|
25
|
+
Dynamic: description-content-type
|
|
26
|
+
Dynamic: home-page
|
|
27
|
+
Dynamic: keywords
|
|
28
|
+
Dynamic: license
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
Dynamic: requires-python
|
|
31
|
+
Dynamic: summary
|
|
32
|
+
|
|
33
|
+
[pyconll on GitHub](https://github.com/pyconll/pyconll)
|
|
34
|
+
[Coverage on Coveralls](https://coveralls.io/github/pyconll/pyconll?branch=master)
|
|
35
|
+
[Documentation on Read the Docs](https://pyconll.readthedocs.io/en/stable)
|
|
36
|
+
[Releases](https://github.com/pyconll/pyconll/releases)
|
|
37
|
+
[Chat on Gitter](https://gitter.im/pyconll/pyconll?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
|
38
|
+
|
|
39
|
+
## pyconll
|
|
40
|
+
|
|
41
|
+
*Easily work with **CoNLL** files using the familiar syntax of **python**.*
|
|
42
|
+
|
|
43
|
+
<img src="res/logo.svg" width="256px" height="256px">
|
|
44
|
+
|
|
45
|
+
##### Links
|
|
46
|
+
- [Homepage](https://pyconll.github.io)
|
|
47
|
+
- [Documentation](https://pyconll.readthedocs.io/)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
### Installation
|
|
51
|
+
|
|
52
|
+
As with most Python packages, simply use `pip` to install from PyPI.
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
pip install pyconll
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
`pyconll` is also available as a conda package on the `pyconll` channel. Only versions 2.2.0 and newer are available on conda at the moment.
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
conda install -c pyconll pyconll
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
pyconll supports Python 3.10 or newer. In general, pyconll will focus development efforts on officially supported Python versions.
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
### Use
|
|
68
|
+
|
|
69
|
+
This tool is intended to be a **minimal**, **low level**, **expressive** and **pragmatic** library in a widely used programming language. pyconll creates a thin API on top of raw CoNLL annotations that is simple and intuitive.
|
|
70
|
+
|
|
71
|
+
It offers the following features:
|
|
72
|
+
* Regular CI testing and validation against all UD v2.x versions.
|
|
73
|
+
* A strong domain model that includes CoNLL sources, Sentences, Tokens, Trees, etc.
|
|
74
|
+
* A typed API for better development experience and better semantics.
|
|
75
|
+
* A focus on usability and simplicity in design (no dependencies)
|
|
76
|
+
* Performance optimizations for a smooth development workflow no matter the dataset size (performs about 25%-35% faster than other comparable packages)
|
|
77
|
+
|
|
78
|
+
See the following code example to understand the basics of the API.
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
# This snippet finds sentences where a token marked with part of speech 'AUX' is
|
|
82
|
+
# governed by a NOUN. For example, in French this is a less common construction
|
|
83
|
+
# and we may want to validate these examples because we have previously found some
|
|
84
|
+
# problematic examples of this construction.
|
|
85
|
+
import pyconll
|
|
86
|
+
|
|
87
|
+
train = pyconll.load_from_file('./ud/train.conllu')
|
|
88
|
+
|
|
89
|
+
review_sentences = []
|
|
90
|
+
|
|
91
|
+
# Conll objects are iterable over their sentences, and sentences are iterable
|
|
92
|
+
# over their tokens. Sentences also de/serialize comment information.
|
|
93
|
+
for sentence in train:
|
|
94
|
+
for token in sentence:
|
|
95
|
+
|
|
96
|
+
# Tokens have attributes such as upos, head, id, deprel, etc, and sentences
|
|
97
|
+
# can be indexed by a token's id. We must check that the token is not the
|
|
98
|
+
# root token, whose id, '0', cannot be looked up.
|
|
99
|
+
if token.upos == 'AUX' and (token.head != '0' and sentence[token.head].upos == 'NOUN'):
|
|
100
|
+
review_sentences.append(sentence)
|
|
101
|
+
|
|
102
|
+
print('Review the following sentences:')
|
|
103
|
+
for sent in review_sentences:
|
|
104
|
+
print(sent.id)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
A full definition of the API can be found in the [documentation](https://pyconll.readthedocs.io/) or use the [quick start](https://pyconll.readthedocs.io/en/stable/starting.html) guide for a focused introduction.
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
### Uses and Limitations
|
|
111
|
+
|
|
112
|
+
This package edits CoNLL-U annotations. This does not include the annotated text itself. Word forms on Tokens are not editable and Sentence Tokens cannot be reassigned or reordered. `pyconll` focuses on editing CoNLL-U annotation rather than creating it or changing the underlying text that is annotated. If there is interest in this functionality area, please create a GitHub issue for more visibility.
|
|
113
|
+
|
|
114
|
+
This package is also validated only against the CoNLL-U format. The CoNLL and CoNLL-X formats are not supported, but are very similar. I originally intended to support these formats as well, but they are not as well defined as CoNLL-U, so they are not included. Please create an issue for visibility if this feature interests you.
|
|
115
|
+
|
|
116
|
+
Lastly, linguistic data can often be very large and this package attempts to keep that in mind. pyconll provides methods for creating in-memory Conll objects along with an iterate-only version in case a corpus is too large to store in memory (the in-memory structure is several times larger than the actual corpus file). The iterate-only version can parse upwards of 100,000 words per second on a machine with 16 GB of RAM, so for most datasets used on a local dev machine, this package will perform well. The 2.2.0 release also improves parse time and memory footprint by about 25%!
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
### Contributing
|
|
120
|
+
|
|
121
|
+
Contributions to this project are welcome and encouraged! If you are unsure how to contribute, here is a [guide](https://help.github.com/en/articles/creating-a-pull-request-from-a-fork) from GitHub explaining the basic workflow. After cloning this repo, please run `pip install -r requirements.txt` to set up your local environment. Some of these tools, like yapf, pylint, and mypy, do not have to be run locally, but CI builds will fail unless they run successfully. Some other release dependencies, like twine and sphinx, are also installed.
|
|
122
|
+
|
|
123
|
+
For packaging new versions, use setuptools version 24.2.0 or greater so that the resulting package recognizes the `python_requires` metadata. Final packaging and release are now done with GitHub Actions, so this is less of a concern.
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
#### README and CHANGELOG
|
|
127
|
+
|
|
128
|
+
When changing either of these files, please change the Markdown version and run ``make gendocs`` so that the other versions stay in sync.
|
|
129
|
+
|
|
130
|
+
#### Release Checklist
|
|
131
|
+
|
|
132
|
+
The general release process is enumerated explicitly below. This section is for internal use and most people do not have to worry about it. First, note that the dev branch is always a direct extension of master with the latest changes since the last release. That is, it is essentially a staging release branch.
|
|
133
|
+
|
|
134
|
+
* Change the version in `pyconll/_version.py` appropriately.
|
|
135
|
+
* Merge dev into master **locally**. GitHub does not offer a fast-forward merge and explicitly uses --no-ff. So, to keep the linear nature of changes, merge locally to fast-forward. This assumes that the dev branch looks good on CI tests, which do not run automatically in this situation.
|
|
136
|
+
* Push the master branch. This should start some CI tests specifically for master. After validating these results, create a tag corresponding to the next version number and push the tag.
|
|
137
|
+
* Create a new release from this tag from the [Releases page](https://github.com/pyconll/pyconll/releases). On creating this release, two workflows will start. One releases to PyPI, and the other releases to conda.
|
|
138
|
+
* Validate that these workflows pass and that the package is properly released on both platforms.
|
|
@@ -29,7 +29,7 @@ pip install pyconll
|
|
|
29
29
|
conda install -c pyconll pyconll
|
|
30
30
|
```
|
|
31
31
|
|
|
32
|
-
pyconll supports Python 3.
|
|
32
|
+
pyconll supports Python 3.10 or newer. In general, pyconll will focus development efforts on officially supported Python versions.
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
### Use
|
|
@@ -5,6 +5,6 @@ and python code.
|
|
|
5
5
|
|
|
6
6
|
__all__ = ['conllable', 'exception', 'load', 'tree', 'unit', 'util']
|
|
7
7
|
|
|
8
|
-
from .load import load_from_string, load_from_file,
|
|
9
|
-
iter_from_file
|
|
8
|
+
from .load import load_from_string, load_from_file, load_from_resource, \
|
|
9
|
+
iter_from_string, iter_from_file, iter_from_resource
|
|
10
10
|
from ._version import __version__
|
|
@@ -6,15 +6,18 @@ can then be used in the Conll class or in pyconll.load.
|
|
|
6
6
|
|
|
7
7
|
from typing import Iterable, Iterator
|
|
8
8
|
|
|
9
|
+
from pyconll.exception import ParseError
|
|
9
10
|
from pyconll.unit.sentence import Sentence
|
|
10
11
|
|
|
11
12
|
|
|
12
|
-
def _create_sentence(sent_lines: Iterable[str]) -> Sentence:
|
|
13
|
+
def _create_sentence(sent_lines: Iterable[str], line_num: int) -> Sentence:
|
|
13
14
|
"""
|
|
14
15
|
Creates a Sentence object given the current state of the source iteration.
|
|
15
16
|
|
|
16
17
|
Args:
|
|
17
18
|
sent_lines: An iterable of the lines that make up the source.
|
|
19
|
+
line_num: The current line number the sentence starts at, for logging
|
|
20
|
+
purposes.
|
|
18
21
|
|
|
19
22
|
Returns:
|
|
20
23
|
The created Sentence.
|
|
@@ -23,7 +26,11 @@ def _create_sentence(sent_lines: Iterable[str]) -> Sentence:
|
|
|
23
26
|
ParseError: If the sentence source is not valid.
|
|
24
27
|
"""
|
|
25
28
|
sent_source = '\n'.join(sent_lines)
|
|
26
|
-
|
|
29
|
+
try:
|
|
30
|
+
sentence = Sentence(sent_source)
|
|
31
|
+
except ParseError as err:
|
|
32
|
+
raise ParseError(
|
|
33
|
+
f'Failed to create sentence at line {line_num}') from err
|
|
27
34
|
|
|
28
35
|
return sentence
|
|
29
36
|
|
|
@@ -44,19 +51,22 @@ def iter_sentences(lines_it: Iterable[str]) -> Iterator[Sentence]:
|
|
|
44
51
|
ValueError: If there is an error constructing the Sentence.
|
|
45
52
|
"""
|
|
46
53
|
sent_lines = []
|
|
47
|
-
|
|
54
|
+
last_empty_line = -1
|
|
55
|
+
for i, line in enumerate(lines_it):
|
|
48
56
|
line = line.strip()
|
|
49
57
|
|
|
50
58
|
# Collect all lines until there is a blank line. Then all the
|
|
51
59
|
# collected lines were between blank lines and are a sentence.
|
|
52
60
|
if line:
|
|
53
61
|
sent_lines.append(line)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
62
|
+
else:
|
|
63
|
+
if sent_lines:
|
|
64
|
+
sentence = _create_sentence(sent_lines, last_empty_line + 2)
|
|
65
|
+
sent_lines.clear()
|
|
66
|
+
yield sentence
|
|
57
67
|
|
|
58
|
-
|
|
68
|
+
last_empty_line = i
|
|
59
69
|
|
|
60
70
|
if sent_lines:
|
|
61
|
-
sentence = _create_sentence(sent_lines)
|
|
71
|
+
sentence = _create_sentence(sent_lines, last_empty_line)
|
|
62
72
|
yield sentence
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""
|
|
2
|
+
A wrapper around the Conll class to easily load treebanks from multiple formats.
|
|
3
|
+
This module can also load resources by iterating over treebank data without
|
|
4
|
+
storing Conll objects in memory. This module is the main entrance to pyconll's
|
|
5
|
+
functionalities.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from typing import Iterable, Iterator
|
|
10
|
+
|
|
11
|
+
from pyconll._parser import iter_sentences
|
|
12
|
+
from pyconll.unit.conll import Conll
|
|
13
|
+
from pyconll.unit.sentence import Sentence
|
|
14
|
+
|
|
15
|
+
PathLike = str | bytes | os.PathLike
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def load_from_string(source: str) -> Conll:
|
|
19
|
+
"""
|
|
20
|
+
Load the CoNLL-U source in a string into a Conll object.
|
|
21
|
+
|
|
22
|
+
Args:
|
|
23
|
+
source: The CoNLL-U formatted string.
|
|
24
|
+
|
|
25
|
+
Returns:
|
|
26
|
+
A Conll object equivalent to the provided source.
|
|
27
|
+
|
|
28
|
+
Raises:
|
|
29
|
+
ParseError: If there is an error parsing the input into a Conll object.
|
|
30
|
+
"""
|
|
31
|
+
lines = source.splitlines()
|
|
32
|
+
c = Conll(lines)
|
|
33
|
+
|
|
34
|
+
return c
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def load_from_file(file_descriptor: PathLike) -> Conll:
|
|
38
|
+
"""
|
|
39
|
+
Load a CoNLL-U file given its location.
|
|
40
|
+
|
|
41
|
+
Args:
|
|
42
|
+
file_descriptor: The file to load the CoNLL-U data from. This can be a
|
|
43
|
+
filepath as a Path object, or string, or a file descriptor.
|
|
44
|
+
|
|
45
|
+
Returns:
|
|
46
|
+
A Conll object equivalent to the provided file.
|
|
47
|
+
|
|
48
|
+
Raises:
|
|
49
|
+
IOError: If there is an error opening the given filename.
|
|
50
|
+
ParseError: If there is an error parsing the input into a Conll object.
|
|
51
|
+
"""
|
|
52
|
+
with open(file_descriptor, encoding='utf-8') as f:
|
|
53
|
+
c = Conll(f)
|
|
54
|
+
|
|
55
|
+
return c
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def load_from_resource(resource: Iterable[str]) -> Conll:
|
|
59
|
+
"""
|
|
60
|
+
Load a CoNLL-U file from a generic string resource.
|
|
61
|
+
|
|
62
|
+
Args:
|
|
63
|
+
resource: The generic string resource. Each string from the resource is
|
|
64
|
+
assumed to be a line in a CoNLL-U formatted resource.
|
|
65
|
+
|
|
66
|
+
Returns:
|
|
67
|
+
A Conll object equivalent to the string resource provided.
|
|
68
|
+
|
|
69
|
+
Raises:
|
|
70
|
+
ParseError: If there is an error parsing the input into a Conll object.
|
|
71
|
+
"""
|
|
72
|
+
return Conll(resource)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def iter_from_string(source: str) -> Iterator[Sentence]:
|
|
76
|
+
"""
|
|
77
|
+
Iterate over a CoNLL-U string's sentences.
|
|
78
|
+
|
|
79
|
+
Use this method if you only need to iterate over the CoNLL-U file once and
|
|
80
|
+
do not need to create or store the Conll object.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
source: The CoNLL-U string.
|
|
84
|
+
|
|
85
|
+
Yields:
|
|
86
|
+
The sentences that make up the CoNLL-U file.
|
|
87
|
+
|
|
88
|
+
Raises:
|
|
89
|
+
ParseError: If there is an error parsing the input into a Conll object.
|
|
90
|
+
"""
|
|
91
|
+
lines = source.splitlines()
|
|
92
|
+
yield from iter_sentences(lines)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def iter_from_file(file_descriptor: PathLike) -> Iterator[Sentence]:
|
|
96
|
+
"""
|
|
97
|
+
Iterate over a CoNLL-U file's sentences.
|
|
98
|
+
|
|
99
|
+
Args:
|
|
100
|
+
file_descriptor: The file to iterate the CoNLL-U data from. This can be a
|
|
101
|
+
filepath as a Path object, or string, or a file descriptor.
|
|
102
|
+
|
|
103
|
+
Yields:
|
|
104
|
+
The sentences that make up the CoNLL-U file.
|
|
105
|
+
|
|
106
|
+
Raises:
|
|
107
|
+
IOError: If there is an error opening the file.
|
|
108
|
+
ParseError: If there is an error parsing the input into a Conll object.
|
|
109
|
+
"""
|
|
110
|
+
with open(file_descriptor, encoding='utf-8') as f:
|
|
111
|
+
yield from iter_sentences(f)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def iter_from_resource(resource: Iterable[str]) -> Iterator[Sentence]:
|
|
115
|
+
"""
|
|
116
|
+
Iterate over the sentences from an iterable string resource.
|
|
117
|
+
|
|
118
|
+
This is a generic method that allows for any general resource that can
|
|
119
|
+
provide data (like a streaming network request or memory mapped data) to be
|
|
120
|
+
parsed as a CoNLL-U data source.
|
|
121
|
+
|
|
122
|
+
Args:
|
|
123
|
+
resource: The line source. Each iterated string should be a line in a
|
|
124
|
+
CoNLL-U formatted file.
|
|
125
|
+
|
|
126
|
+
Yields:
|
|
127
|
+
The sentences that make up the CoNLL-U file.
|
|
128
|
+
|
|
129
|
+
Raises:
|
|
130
|
+
ParseError: If there is an error parsing the input into a Conll object.
|
|
131
|
+
"""
|
|
132
|
+
yield from iter_sentences(resource)
|
|
File without changes
|
|
@@ -23,6 +23,7 @@ class TreeBuilder(Generic[T]):
|
|
|
23
23
|
created from the same TreeBuilder, Tree nodes will be unique, but data on
|
|
24
24
|
the nodes will be shallow copies.
|
|
25
25
|
"""
|
|
26
|
+
|
|
26
27
|
def __init__(self) -> None:
|
|
27
28
|
"""
|
|
28
29
|
Creates a new empty TreeBuilder, with no internal data.
|
|
@@ -76,8 +77,8 @@ class TreeBuilder(Generic[T]):
|
|
|
76
77
|
self.current = self.current[i]
|
|
77
78
|
except IndexError as e:
|
|
78
79
|
raise IndexError(
|
|
79
|
-
'{}-th child is out of range. There are {} children on this node'
|
|
80
|
-
|
|
80
|
+
f'{i}-th child is out of range. There are {len(self.current)} children on this node'
|
|
81
|
+
) from e
|
|
81
82
|
|
|
82
83
|
def move_to_root(self) -> None:
|
|
83
84
|
"""
|
|
@@ -121,8 +122,8 @@ class TreeBuilder(Generic[T]):
|
|
|
121
122
|
del self.current._children[i]
|
|
122
123
|
except IndexError as e:
|
|
123
124
|
raise IndexError(
|
|
124
|
-
'{}-th child is out of range. There are {} children on this node'
|
|
125
|
-
|
|
125
|
+
f'{i}-th child is out of range. There are {len(self.current)} children on this node'
|
|
126
|
+
) from e
|
|
126
127
|
|
|
127
128
|
def add_child(self, data: T, move: bool = False) -> None:
|
|
128
129
|
"""
|
|
@@ -18,6 +18,7 @@ class Tree(Generic[T]):
|
|
|
18
18
|
module which is a sort of friend class of Tree to maintain its immutable
|
|
19
19
|
public contract.
|
|
20
20
|
"""
|
|
21
|
+
|
|
21
22
|
def __init__(self, data: T) -> None:
|
|
22
23
|
"""
|
|
23
24
|
Create a tree holding the value. Create a larger Tree, with TreeBuilder.
|
|
@@ -70,8 +71,7 @@ class Tree(Generic[T]):
|
|
|
70
71
|
"""
|
|
71
72
|
Provides an iterator over the children.
|
|
72
73
|
"""
|
|
73
|
-
|
|
74
|
-
yield child
|
|
74
|
+
yield from self._children
|
|
75
75
|
|
|
76
76
|
def __len__(self) -> int:
|
|
77
77
|
"""
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
Defines the Conll type and the associated parsing and output logic.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
from typing import Any, Iterable, Iterator,
|
|
5
|
+
from typing import Any, Iterable, Iterator, MutableSequence, overload
|
|
6
6
|
|
|
7
7
|
import pyconll._parser
|
|
8
8
|
from pyconll.conllable import Conllable
|
|
@@ -17,6 +17,7 @@ class Conll(MutableSequence[Sentence], Conllable):
|
|
|
17
17
|
specifies that the file must end in a new line but that requirement is
|
|
18
18
|
relaxed here in parsing.
|
|
19
19
|
"""
|
|
20
|
+
|
|
20
21
|
def __init__(self, it: Iterable[str]) -> None:
|
|
21
22
|
"""
|
|
22
23
|
Create a CoNLL-U file collection of sentences.
|
|
@@ -28,7 +29,7 @@ class Conll(MutableSequence[Sentence], Conllable):
|
|
|
28
29
|
ParseError: If there is an error constructing the sentences in the
|
|
29
30
|
iterator.
|
|
30
31
|
"""
|
|
31
|
-
self._sentences:
|
|
32
|
+
self._sentences: list[Sentence] = []
|
|
32
33
|
|
|
33
34
|
for sentence in pyconll._parser.iter_sentences(it):
|
|
34
35
|
self._sentences.append(sentence)
|
|
@@ -39,6 +40,10 @@ class Conll(MutableSequence[Sentence], Conllable):
|
|
|
39
40
|
|
|
40
41
|
Returns:
|
|
41
42
|
The CoNLL-U object as a string. This string will end in a newline.
|
|
43
|
+
|
|
44
|
+
Raises:
|
|
45
|
+
FormatError: If there are issues converting the sentences to the
|
|
46
|
+
CoNLL format.
|
|
42
47
|
"""
|
|
43
48
|
# Add newlines along with sentence strings so that there is no need to
|
|
44
49
|
# slice potentially long lists or modify strings.
|
|
@@ -95,8 +100,7 @@ class Conll(MutableSequence[Sentence], Conllable):
|
|
|
95
100
|
Yields:
|
|
96
101
|
An iterator over the sentences in this Conll object.
|
|
97
102
|
"""
|
|
98
|
-
|
|
99
|
-
yield sentence
|
|
103
|
+
yield from self._sentences
|
|
100
104
|
|
|
101
105
|
@overload
|
|
102
106
|
def __getitem__(self, key: int) -> Sentence:
|
|
@@ -152,7 +156,7 @@ class Conll(MutableSequence[Sentence], Conllable):
|
|
|
152
156
|
"""
|
|
153
157
|
self._sentences[key] = item
|
|
154
158
|
|
|
155
|
-
def __delitem__(self, key:
|
|
159
|
+
def __delitem__(self, key: int | slice) -> None:
|
|
156
160
|
"""
|
|
157
161
|
Delete the Sentence corresponding with the given key.
|
|
158
162
|
|
|
@@ -4,9 +4,10 @@ Defines the Sentence type and the associated parsing and output logic.
|
|
|
4
4
|
|
|
5
5
|
from collections import OrderedDict
|
|
6
6
|
import re
|
|
7
|
-
from typing import ClassVar,
|
|
7
|
+
from typing import ClassVar, Iterator, Optional, Sequence, overload
|
|
8
8
|
|
|
9
9
|
from pyconll.conllable import Conllable
|
|
10
|
+
from pyconll.exception import FormatError, ParseError
|
|
10
11
|
from pyconll.tree._treebuilder import TreeBuilder
|
|
11
12
|
from pyconll.tree.tree import Tree
|
|
12
13
|
from pyconll.unit.token import Token
|
|
@@ -61,10 +62,10 @@ class Sentence(Sequence[Token], Conllable):
|
|
|
61
62
|
lines = source.split('\n')
|
|
62
63
|
|
|
63
64
|
self._meta: OrderedDict[str, Optional[str]] = OrderedDict() # pylint: disable=E1136
|
|
64
|
-
self._tokens:
|
|
65
|
-
self._ids_to_indexes:
|
|
65
|
+
self._tokens: list[Token] = []
|
|
66
|
+
self._ids_to_indexes: dict[str, int] = {}
|
|
66
67
|
|
|
67
|
-
for line in lines:
|
|
68
|
+
for i, line in enumerate(lines):
|
|
68
69
|
if line:
|
|
69
70
|
if line[0] == Sentence.COMMENT_MARKER:
|
|
70
71
|
kv_match = re.match(Sentence.KEY_VALUE_COMMENT_PATTERN,
|
|
@@ -81,7 +82,13 @@ class Sentence(Sequence[Token], Conllable):
|
|
|
81
82
|
k = singleton_match.group(1)
|
|
82
83
|
self._meta[k] = None
|
|
83
84
|
else:
|
|
84
|
-
|
|
85
|
+
try:
|
|
86
|
+
token = Token(line)
|
|
87
|
+
except ParseError as err:
|
|
88
|
+
raise ParseError(
|
|
89
|
+
f'Error creating token on line {i} for the current sentence'
|
|
90
|
+
) from err
|
|
91
|
+
|
|
85
92
|
self._tokens.append(token)
|
|
86
93
|
|
|
87
94
|
if token.id is not None:
|
|
@@ -163,7 +170,7 @@ class Sentence(Sequence[Token], Conllable):
|
|
|
163
170
|
singleton, this field can be ignored or set to None.
|
|
164
171
|
"""
|
|
165
172
|
if key == Sentence.TEXT_KEY:
|
|
166
|
-
raise ValueError('Key cannot be {
|
|
173
|
+
raise ValueError(f'Key cannot be {Sentence.TEXT_KEY}')
|
|
167
174
|
|
|
168
175
|
self._meta[key] = value
|
|
169
176
|
|
|
@@ -179,7 +186,7 @@ class Sentence(Sequence[Token], Conllable):
|
|
|
179
186
|
ValueError: If the text key is provided, regardless of presence.
|
|
180
187
|
"""
|
|
181
188
|
if key == Sentence.TEXT_KEY:
|
|
182
|
-
raise ValueError('Key cannot be {
|
|
189
|
+
raise ValueError(f'Key cannot be {Sentence.TEXT_KEY}')
|
|
183
190
|
|
|
184
191
|
del self._meta[key]
|
|
185
192
|
|
|
@@ -205,7 +212,7 @@ class Sentence(Sequence[Token], Conllable):
|
|
|
205
212
|
ValueError: If the sentence can not be made into a tree because a
|
|
206
213
|
token has an empty head value or if there is no root token.
|
|
207
214
|
"""
|
|
208
|
-
children_tokens:
|
|
215
|
+
children_tokens: dict[str, list[Token]] = {}
|
|
209
216
|
|
|
210
217
|
for token in self:
|
|
211
218
|
if token.head is not None:
|
|
@@ -215,8 +222,8 @@ class Sentence(Sequence[Token], Conllable):
|
|
|
215
222
|
children_tokens[token.head] = [token]
|
|
216
223
|
elif not (token.is_multiword() or token.is_empty_node()):
|
|
217
224
|
raise ValueError(
|
|
218
|
-
'The current sentence is not fully defined as a tree and '
|
|
219
|
-
'
|
|
225
|
+
'The current sentence is not fully defined as a tree and has a token with an '
|
|
226
|
+
f'empty head at {token.id}')
|
|
220
227
|
|
|
221
228
|
builder: TreeBuilder[Token] = TreeBuilder()
|
|
222
229
|
if '0' in children_tokens:
|
|
@@ -237,7 +244,7 @@ class Sentence(Sequence[Token], Conllable):
|
|
|
237
244
|
@staticmethod
|
|
238
245
|
def _create_tree_helper(builder: TreeBuilder, sentence: 'Sentence',
|
|
239
246
|
root: Token,
|
|
240
|
-
children_tokens:
|
|
247
|
+
children_tokens: dict[str, list[Token]]) -> None:
|
|
241
248
|
"""
|
|
242
249
|
Method to create a tree from a sentence given the root token.
|
|
243
250
|
|
|
@@ -267,19 +274,27 @@ class Sentence(Sequence[Token], Conllable):
|
|
|
267
274
|
|
|
268
275
|
Returns:
|
|
269
276
|
A string representing the Sentence in CoNLL-U format.
|
|
277
|
+
|
|
278
|
+
Raises:
|
|
279
|
+
FormatError: If the Sentence or underlying Tokens can not be
|
|
280
|
+
converted to the CoNLL format.
|
|
270
281
|
"""
|
|
271
282
|
lines = []
|
|
272
283
|
for meta in self._meta.items():
|
|
273
284
|
if meta[1] is not None:
|
|
274
|
-
line = '{} {} = {
|
|
275
|
-
meta[1])
|
|
285
|
+
line = f'{Sentence.COMMENT_MARKER} {meta[0]} = {meta[1]}'
|
|
276
286
|
else:
|
|
277
|
-
line = '{
|
|
287
|
+
line = f'{Sentence.COMMENT_MARKER} {meta[0]}'
|
|
278
288
|
|
|
279
289
|
lines.append(line)
|
|
280
290
|
|
|
281
291
|
for token in self._tokens:
|
|
282
|
-
|
|
292
|
+
try:
|
|
293
|
+
lines.append(token.conll())
|
|
294
|
+
except FormatError as err:
|
|
295
|
+
raise FormatError(
|
|
296
|
+
f'Error serializing sentence with id {self.id} on token \'{token.id}\'.'
|
|
297
|
+
) from err
|
|
283
298
|
|
|
284
299
|
return '\n'.join(lines)
|
|
285
300
|
|
|
@@ -288,8 +303,7 @@ class Sentence(Sequence[Token], Conllable):
|
|
|
288
303
|
Iterate through all the tokens in the Sentence including multiword
|
|
289
304
|
tokens.
|
|
290
305
|
"""
|
|
291
|
-
|
|
292
|
-
yield token
|
|
306
|
+
yield from self._tokens
|
|
293
307
|
|
|
294
308
|
@overload
|
|
295
309
|
def __getitem__(self, key: str) -> Token:
|