pyconll 3.2.0__tar.gz → 3.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyconll-3.2.0 → pyconll-3.3.1}/MANIFEST.in +0 -2
- pyconll-3.3.1/PKG-INFO +138 -0
- {pyconll-3.2.0 → pyconll-3.3.1}/README.md +1 -1
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/_version.py +1 -1
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/conllable.py +1 -0
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/load.py +5 -8
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/tree/_treebuilder.py +1 -0
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/tree/tree.py +2 -2
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/unit/conll.py +5 -5
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/unit/sentence.py +6 -7
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/unit/token.py +14 -11
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/util.py +5 -4
- pyconll-3.3.1/pyconll.egg-info/PKG-INFO +138 -0
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll.egg-info/SOURCES.txt +4 -3
- {pyconll-3.2.0 → pyconll-3.3.1}/setup.py +3 -3
- pyconll-3.3.1/tests/test_conllable.py +27 -0
- pyconll-3.3.1/tests/test_load.py +110 -0
- pyconll-3.3.1/tests/test_util.py +213 -0
- pyconll-3.2.0/PKG-INFO +0 -190
- pyconll-3.2.0/README +0 -137
- pyconll-3.2.0/README.rst +0 -169
- pyconll-3.2.0/pyconll.egg-info/PKG-INFO +0 -190
- {pyconll-3.2.0 → pyconll-3.3.1}/LICENSE +0 -0
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/__init__.py +0 -0
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/_parser.py +0 -0
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/exception.py +0 -0
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/py.typed +0 -0
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/tree/__init__.py +0 -0
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll/unit/__init__.py +0 -0
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll.egg-info/dependency_links.txt +0 -0
- {pyconll-3.2.0 → pyconll-3.3.1}/pyconll.egg-info/top_level.txt +0 -0
- {pyconll-3.2.0 → pyconll-3.3.1}/setup.cfg +0 -0
pyconll-3.3.1/PKG-INFO
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyconll
|
|
3
|
+
Version: 3.3.1
|
|
4
|
+
Summary: Read and manipulate CoNLL files
|
|
5
|
+
Home-page: https://github.com/pyconll/pyconll
|
|
6
|
+
Author: Matias Grioni
|
|
7
|
+
Author-email: matgrioni@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Keywords: nlp,conllu,conll,universal dependencies
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Education
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering
|
|
17
|
+
Classifier: Topic :: Utilities
|
|
18
|
+
Requires-Python: ~=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Dynamic: author
|
|
22
|
+
Dynamic: author-email
|
|
23
|
+
Dynamic: classifier
|
|
24
|
+
Dynamic: description
|
|
25
|
+
Dynamic: description-content-type
|
|
26
|
+
Dynamic: home-page
|
|
27
|
+
Dynamic: keywords
|
|
28
|
+
Dynamic: license
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
Dynamic: requires-python
|
|
31
|
+
Dynamic: summary
|
|
32
|
+
|
|
33
|
+
[](https://github.com/pyconll/pyconll)
|
|
34
|
+
[](https://coveralls.io/github/pyconll/pyconll?branch=master)
|
|
35
|
+
[](https://pyconll.readthedocs.io/en/stable)
|
|
36
|
+
[](https://github.com/pyconll/pyconll/releases)
|
|
37
|
+
[](https://gitter.im/pyconll/pyconll?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
|
38
|
+
|
|
39
|
+
## pyconll
|
|
40
|
+
|
|
41
|
+
*Easily work with **CoNLL** files using the familiar syntax of **python**.*
|
|
42
|
+
|
|
43
|
+
<img src="res/logo.svg" width="256px" height="256px">
|
|
44
|
+
|
|
45
|
+
##### Links
|
|
46
|
+
- [Homepage](https://pyconll.github.io)
|
|
47
|
+
- [Documentation](https://pyconll.readthedocs.io/)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
### Installation
|
|
51
|
+
|
|
52
|
+
As with most python packages, simply use `pip` to install from PyPi.
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
pip install pyconll
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
`pyconll` is also available as a conda package on the `pyconll` channel. Only packages 2.2.0 and newer are available on conda at the moment.
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
conda install -c pyconll pyconll
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
pyconll supports Python 3.10 or newer. In general, pyconll will focus development efforts on officially supported python versions.
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
### Use
|
|
68
|
+
|
|
69
|
+
This tool is intended to be a **minimal**, **low level**, **expressive** and **pragmatic** library in a widely used programming language. pyconll creates a thin API on top of raw CoNLL annotations that is simple and intuitive.
|
|
70
|
+
|
|
71
|
+
It offers the following features:
|
|
72
|
+
* Regular CI testing and validation against all UD v2.x versions.
|
|
73
|
+
* A strong domain model that includes CoNLL sources, Sentences, Tokens, Trees, etc.
|
|
74
|
+
* A typed API for better development experience and better semantics.
|
|
75
|
+
* A focus on usability and simplicity in design (no dependencies)
|
|
76
|
+
* Performance optimizations for a smooth development workflow no matter the dataset size (performs about 25%-35% faster than other comparable packages)
|
|
77
|
+
|
|
78
|
+
See the following code example to understand the basics of the API.
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
# This snippet finds sentences where a token marked with part of speech 'AUX' is
|
|
82
|
+
# governed by a NOUN. For example, in French this is a less common construction
|
|
83
|
+
# and we may want to validate these examples because we have previously found some
|
|
84
|
+
# problematic examples of this construction.
|
|
85
|
+
import pyconll
|
|
86
|
+
|
|
87
|
+
train = pyconll.load_from_file('./ud/train.conllu')
|
|
88
|
+
|
|
89
|
+
review_sentences = []
|
|
90
|
+
|
|
91
|
+
# Conll objects are iterable over their sentences, and sentences are iterable
|
|
92
|
+
# over their tokens. Sentences also de/serialize comment information.
|
|
93
|
+
for sentence in train:
|
|
94
|
+
for token in sentence:
|
|
95
|
+
|
|
96
|
+
# Tokens have attributes such as upos, head, id, deprel, etc, and sentences
|
|
97
|
+
# can be indexed by a token's id. We must check that the token is not the
|
|
98
|
+
# root token, whose id, '0', cannot be looked up.
|
|
99
|
+
if token.upos == 'AUX' and (token.head != '0' and sentence[token.head].upos == 'NOUN'):
|
|
100
|
+
review_sentences.append(sentence)
|
|
101
|
+
|
|
102
|
+
print('Review the following sentences:')
|
|
103
|
+
for sent in review_sentences:
|
|
104
|
+
print(sent.id)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
A full definition of the API can be found in the [documentation](https://pyconll.readthedocs.io/) or use the [quick start](https://pyconll.readthedocs.io/en/stable/starting.html) guide for a focused introduction.
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
### Uses and Limitations
|
|
111
|
+
|
|
112
|
+
This package edits CoNLL-U annotations. This does not include the annotated text itself. Word forms on Tokens are not editable and Sentence Tokens cannot be reassigned or reordered. `pyconll` focuses on editing CoNLL-U annotation rather than creating it or changing the underlying text that is annotated. If there is interest in this functionality area, please create a GitHub issue for more visibility.
|
|
113
|
+
|
|
114
|
+
This package is also only validated against the CoNLL-U format. The CoNLL and CoNLL-X formats are not supported, although they are very similar. I originally intended to support these formats as well, but they are not as well defined as CoNLL-U, so they are not included. Please create an issue for visibility if this feature interests you.
|
|
115
|
+
|
|
116
|
+
Lastly, linguistic data can often be very large and this package attempts to keep that in mind. pyconll provides methods for creating in memory conll objects along with an iterate only version in case a corpus is too large to store in memory (the size of the memory structure is several times larger than the actual corpus file). The iterate only version can parse upwards of 100,000 words per second on a 16gb ram machine, so for most datasets to be used on a local dev machine, this package will perform well. The 2.2.0 release also improves parse time and memory footprint by about 25%!
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
### Contributing
|
|
120
|
+
|
|
121
|
+
Contributions to this project are welcome and encouraged! If you are unsure how to contribute, here is a [guide](https://help.github.com/en/articles/creating-a-pull-request-from-a-fork) from GitHub explaining the basic workflow. After cloning this repo, please run `pip install -r requirements.txt` to properly set up your local environment. Some of the installed tools, like yapf, pylint, and mypy, do not have to be run locally, but CI builds will fail unless they run successfully. Some other release dependencies like twine and sphinx are also installed.
|
|
122
|
+
|
|
123
|
+
For packaging new versions, use setuptools version 24.2.0 or greater for creating the appropriate packaging that recognizes the `python_requires` metadata. Final packaging and release is now done with Github actions so this is less of a concern.
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
#### README and CHANGELOG
|
|
127
|
+
|
|
128
|
+
When changing either of these files, please change the Markdown version and run ``make gendocs`` so that the other versions stay in sync.
|
|
129
|
+
|
|
130
|
+
#### Release Checklist
|
|
131
|
+
|
|
132
|
+
The general release process is enumerated explicitly below. This section is for internal use and most people do not have to worry about it. First, note that the dev branch is always a direct extension of master with the latest changes since the last release. That is, it is essentially a staging release branch.
|
|
133
|
+
|
|
134
|
+
* Change the version in `pyconll/_version.py` appropriately.
|
|
135
|
+
* Merge dev into master **locally**. Github does not offer a fast forward merge and explicitly uses --no-ff. So to keep the linear nature of changes, merge locally to fast forward. This is assuming that the dev branch looks good on CI tests which do not automatically run in this situation.
|
|
136
|
+
* Push the master branch. This should start some CI tests specifically for master. After validating these results, create a tag corresponding to the next version number and push the tag.
|
|
137
|
+
* Create a new release from this tag from the [Releases page](https://github.com/pyconll/pyconll/releases). On creating this release, two workflows will start. One releases to pypi, and the other releases to conda.
|
|
138
|
+
* Validate these workflows pass, and the package is properly released on both platforms.
|
|
@@ -29,7 +29,7 @@ pip install pyconll
|
|
|
29
29
|
conda install -c pyconll pyconll
|
|
30
30
|
```
|
|
31
31
|
|
|
32
|
-
pyconll supports Python 3.
|
|
32
|
+
pyconll supports Python 3.10 or newer. In general, pyconll will focus development efforts on officially supported python versions.
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
### Use
|
|
@@ -6,13 +6,13 @@ functionalities.
|
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
import os
|
|
9
|
-
from typing import Iterable, Iterator
|
|
9
|
+
from typing import Iterable, Iterator
|
|
10
10
|
|
|
11
11
|
from pyconll._parser import iter_sentences
|
|
12
12
|
from pyconll.unit.conll import Conll
|
|
13
13
|
from pyconll.unit.sentence import Sentence
|
|
14
14
|
|
|
15
|
-
PathLike =
|
|
15
|
+
PathLike = str | bytes | os.PathLike
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
def load_from_string(source: str) -> Conll:
|
|
@@ -89,8 +89,7 @@ def iter_from_string(source: str) -> Iterator[Sentence]:
|
|
|
89
89
|
ParseError: If there is an error parsing the input into a Conll object.
|
|
90
90
|
"""
|
|
91
91
|
lines = source.splitlines()
|
|
92
|
-
|
|
93
|
-
yield sentence
|
|
92
|
+
yield from iter_sentences(lines)
|
|
94
93
|
|
|
95
94
|
|
|
96
95
|
def iter_from_file(file_descriptor: PathLike) -> Iterator[Sentence]:
|
|
@@ -109,8 +108,7 @@ def iter_from_file(file_descriptor: PathLike) -> Iterator[Sentence]:
|
|
|
109
108
|
ParseError: If there is an error parsing the input into a Conll object.
|
|
110
109
|
"""
|
|
111
110
|
with open(file_descriptor, encoding='utf-8') as f:
|
|
112
|
-
|
|
113
|
-
yield sentence
|
|
111
|
+
yield from iter_sentences(f)
|
|
114
112
|
|
|
115
113
|
|
|
116
114
|
def iter_from_resource(resource: Iterable[str]) -> Iterator[Sentence]:
|
|
@@ -131,5 +129,4 @@ def iter_from_resource(resource: Iterable[str]) -> Iterator[Sentence]:
|
|
|
131
129
|
Raises:
|
|
132
130
|
ParseError: If there is an error parsing the input into a Conll object.
|
|
133
131
|
"""
|
|
134
|
-
|
|
135
|
-
yield sentence
|
|
132
|
+
yield from iter_sentences(resource)
|
|
@@ -18,6 +18,7 @@ class Tree(Generic[T]):
|
|
|
18
18
|
module which is a sort of friend class of Tree to maintain its immutable
|
|
19
19
|
public contract.
|
|
20
20
|
"""
|
|
21
|
+
|
|
21
22
|
def __init__(self, data: T) -> None:
|
|
22
23
|
"""
|
|
23
24
|
Create a tree holding the value. Create a larger Tree, with TreeBuilder.
|
|
@@ -70,8 +71,7 @@ class Tree(Generic[T]):
|
|
|
70
71
|
"""
|
|
71
72
|
Provides an iterator over the children.
|
|
72
73
|
"""
|
|
73
|
-
|
|
74
|
-
yield child
|
|
74
|
+
yield from self._children
|
|
75
75
|
|
|
76
76
|
def __len__(self) -> int:
|
|
77
77
|
"""
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
Defines the Conll type and the associated parsing and output logic.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
from typing import Any, Iterable, Iterator,
|
|
5
|
+
from typing import Any, Iterable, Iterator, MutableSequence, overload
|
|
6
6
|
|
|
7
7
|
import pyconll._parser
|
|
8
8
|
from pyconll.conllable import Conllable
|
|
@@ -17,6 +17,7 @@ class Conll(MutableSequence[Sentence], Conllable):
|
|
|
17
17
|
specifies that the file must end in a new line but that requirement is
|
|
18
18
|
relaxed here in parsing.
|
|
19
19
|
"""
|
|
20
|
+
|
|
20
21
|
def __init__(self, it: Iterable[str]) -> None:
|
|
21
22
|
"""
|
|
22
23
|
Create a CoNLL-U file collection of sentences.
|
|
@@ -28,7 +29,7 @@ class Conll(MutableSequence[Sentence], Conllable):
|
|
|
28
29
|
ParseError: If there is an error constructing the sentences in the
|
|
29
30
|
iterator.
|
|
30
31
|
"""
|
|
31
|
-
self._sentences:
|
|
32
|
+
self._sentences: list[Sentence] = []
|
|
32
33
|
|
|
33
34
|
for sentence in pyconll._parser.iter_sentences(it):
|
|
34
35
|
self._sentences.append(sentence)
|
|
@@ -99,8 +100,7 @@ class Conll(MutableSequence[Sentence], Conllable):
|
|
|
99
100
|
Yields:
|
|
100
101
|
An iterator over the sentences in this Conll object.
|
|
101
102
|
"""
|
|
102
|
-
|
|
103
|
-
yield sentence
|
|
103
|
+
yield from self._sentences
|
|
104
104
|
|
|
105
105
|
@overload
|
|
106
106
|
def __getitem__(self, key: int) -> Sentence:
|
|
@@ -156,7 +156,7 @@ class Conll(MutableSequence[Sentence], Conllable):
|
|
|
156
156
|
"""
|
|
157
157
|
self._sentences[key] = item
|
|
158
158
|
|
|
159
|
-
def __delitem__(self, key:
|
|
159
|
+
def __delitem__(self, key: int | slice) -> None:
|
|
160
160
|
"""
|
|
161
161
|
Delete the Sentence corresponding with the given key.
|
|
162
162
|
|
|
@@ -4,7 +4,7 @@ Defines the Sentence type and the associated parsing and output logic.
|
|
|
4
4
|
|
|
5
5
|
from collections import OrderedDict
|
|
6
6
|
import re
|
|
7
|
-
from typing import ClassVar,
|
|
7
|
+
from typing import ClassVar, Iterator, Optional, Sequence, overload
|
|
8
8
|
|
|
9
9
|
from pyconll.conllable import Conllable
|
|
10
10
|
from pyconll.exception import FormatError, ParseError
|
|
@@ -62,8 +62,8 @@ class Sentence(Sequence[Token], Conllable):
|
|
|
62
62
|
lines = source.split('\n')
|
|
63
63
|
|
|
64
64
|
self._meta: OrderedDict[str, Optional[str]] = OrderedDict() # pylint: disable=E1136
|
|
65
|
-
self._tokens:
|
|
66
|
-
self._ids_to_indexes:
|
|
65
|
+
self._tokens: list[Token] = []
|
|
66
|
+
self._ids_to_indexes: dict[str, int] = {}
|
|
67
67
|
|
|
68
68
|
for i, line in enumerate(lines):
|
|
69
69
|
if line:
|
|
@@ -212,7 +212,7 @@ class Sentence(Sequence[Token], Conllable):
|
|
|
212
212
|
ValueError: If the sentence can not be made into a tree because a
|
|
213
213
|
token has an empty head value or if there is no root token.
|
|
214
214
|
"""
|
|
215
|
-
children_tokens:
|
|
215
|
+
children_tokens: dict[str, list[Token]] = {}
|
|
216
216
|
|
|
217
217
|
for token in self:
|
|
218
218
|
if token.head is not None:
|
|
@@ -244,7 +244,7 @@ class Sentence(Sequence[Token], Conllable):
|
|
|
244
244
|
@staticmethod
|
|
245
245
|
def _create_tree_helper(builder: TreeBuilder, sentence: 'Sentence',
|
|
246
246
|
root: Token,
|
|
247
|
-
children_tokens:
|
|
247
|
+
children_tokens: dict[str, list[Token]]) -> None:
|
|
248
248
|
"""
|
|
249
249
|
Method to create a tree from a sentence given the root token.
|
|
250
250
|
|
|
@@ -303,8 +303,7 @@ class Sentence(Sequence[Token], Conllable):
|
|
|
303
303
|
Iterate through all the tokens in the Sentence including multiword
|
|
304
304
|
tokens.
|
|
305
305
|
"""
|
|
306
|
-
|
|
307
|
-
yield token
|
|
306
|
+
yield from self._tokens
|
|
308
307
|
|
|
309
308
|
@overload
|
|
310
309
|
def __getitem__(self, key: str) -> Token:
|
|
@@ -6,7 +6,7 @@ format.
|
|
|
6
6
|
|
|
7
7
|
import functools
|
|
8
8
|
import math
|
|
9
|
-
from typing import Callable, ClassVar,
|
|
9
|
+
from typing import Callable, ClassVar, Optional
|
|
10
10
|
|
|
11
11
|
from pyconll.conllable import Conllable
|
|
12
12
|
from pyconll.exception import FormatError, ParseError
|
|
@@ -78,7 +78,7 @@ def _create_dict_tupled_empty_parse(size, strict):
|
|
|
78
78
|
|
|
79
79
|
Args:
|
|
80
80
|
size: The expected size of the tuple.
|
|
81
|
-
strict: Flag to
|
|
81
|
+
strict: Flag to signify if parsed values with less components than size
|
|
82
82
|
will be accepted. In this case, missing values will be supplemented
|
|
83
83
|
with None.
|
|
84
84
|
|
|
@@ -89,6 +89,7 @@ def _create_dict_tupled_empty_parse(size, strict):
|
|
|
89
89
|
ParseError: If the parsing is strict and there is a component size
|
|
90
90
|
mismatch, or if there are too many components in general.
|
|
91
91
|
"""
|
|
92
|
+
|
|
92
93
|
def _dict_tupled_empty_parser(v, v_delimiter):
|
|
93
94
|
"""
|
|
94
95
|
Map a value into the appropriate form, for a tupled based column.
|
|
@@ -123,7 +124,7 @@ def _create_dict_tupled_empty_parse(size, strict):
|
|
|
123
124
|
return _dict_tupled_empty_parser
|
|
124
125
|
|
|
125
126
|
|
|
126
|
-
TUPLE_PARSER_MEMOIZE:
|
|
127
|
+
TUPLE_PARSER_MEMOIZE: dict[int, Callable[[str, str], tuple[Optional[str],
|
|
127
128
|
...]]] = {}
|
|
128
129
|
|
|
129
130
|
|
|
@@ -237,7 +238,7 @@ def _dict_empty_map_helper(values, empty, delim, av_separator, v_delimiter,
|
|
|
237
238
|
if len(parts) == 1 or (len(parts) == 2 and parts[1] == ''):
|
|
238
239
|
k = parts[0]
|
|
239
240
|
v = None
|
|
240
|
-
|
|
241
|
+
else:
|
|
241
242
|
k, v = parts
|
|
242
243
|
|
|
243
244
|
parsed = parser(v, v_delimiter)
|
|
@@ -419,6 +420,7 @@ def _dict_conll_map_helper(values, empty, delim, av_separator, v_delimiter,
|
|
|
419
420
|
Returns:
|
|
420
421
|
The CoNLL-U formatted equivalent to the value.
|
|
421
422
|
"""
|
|
423
|
+
|
|
422
424
|
def paramed(pair):
|
|
423
425
|
f = formatter(pair[1], v_delimiter)
|
|
424
426
|
if f is None:
|
|
@@ -445,6 +447,7 @@ class _TokenIdComparer:
|
|
|
445
447
|
being compared by the start index and then by the end index, and decimal
|
|
446
448
|
ids having the radix separated parts compared separately.
|
|
447
449
|
"""
|
|
450
|
+
|
|
448
451
|
def __init__(self, token_id):
|
|
449
452
|
"""
|
|
450
453
|
Create the comparer wrapping the given, assumed valid format, id.
|
|
@@ -619,9 +622,9 @@ class Token(Conllable):
|
|
|
619
622
|
# Keys for sorting attribute-value columns. BY_ID converts the attribute
|
|
620
623
|
# value pair to the integer value of the attribute, and BY_CASE_INSENSITIVE
|
|
621
624
|
# converts the pair to the lowercase version of the attribute.
|
|
622
|
-
BY_ID: ClassVar[Callable[[
|
|
625
|
+
BY_ID: ClassVar[Callable[[tuple[
|
|
623
626
|
str, str]], _TokenIdComparer]] = lambda pair: _TokenIdComparer(pair[0])
|
|
624
|
-
BY_CASE_INSENSITIVE: ClassVar[Callable[[
|
|
627
|
+
BY_CASE_INSENSITIVE: ClassVar[Callable[[tuple[
|
|
625
628
|
str, str]], str]] = lambda pair: pair[0].lower()
|
|
626
629
|
|
|
627
630
|
def __init__(self, source: str, empty: bool = False) -> None:
|
|
@@ -678,18 +681,18 @@ class Token(Conllable):
|
|
|
678
681
|
|
|
679
682
|
self.upos: Optional[str] = _unit_empty_map(fields[3], Token.EMPTY)
|
|
680
683
|
self.xpos: Optional[str] = _unit_empty_map(fields[4], Token.EMPTY)
|
|
681
|
-
self.feats:
|
|
682
|
-
|
|
684
|
+
self.feats: dict[str,
|
|
685
|
+
set[str]] = _dict_empty_map(fields[5], Token.EMPTY,
|
|
683
686
|
Token.COMPONENT_DELIMITER,
|
|
684
687
|
Token.AV_SEPARATOR,
|
|
685
688
|
Token.V_DELIMITER)
|
|
686
689
|
self.head: Optional[str] = _unit_empty_map(fields[6], Token.EMPTY)
|
|
687
690
|
self.deprel: Optional[str] = _unit_empty_map(fields[7], Token.EMPTY)
|
|
688
|
-
self.deps:
|
|
689
|
-
|
|
691
|
+
self.deps: dict[str,
|
|
692
|
+
tuple[str, str, str, str]] = _dict_tupled_empty_map(
|
|
690
693
|
fields[8], Token.EMPTY, Token.COMPONENT_DELIMITER,
|
|
691
694
|
Token.AV_DEPS_SEPARATOR, Token.V_DEPS_DELIMITER, 4)
|
|
692
|
-
self.misc:
|
|
695
|
+
self.misc: dict[str, Optional[set[str]]] = _dict_mixed_empty_map(
|
|
693
696
|
fields[9], Token.EMPTY, Token.COMPONENT_DELIMITER,
|
|
694
697
|
Token.AV_SEPARATOR, Token.V_DELIMITER)
|
|
695
698
|
|
|
@@ -5,7 +5,7 @@ collection of functions.
|
|
|
5
5
|
|
|
6
6
|
import functools
|
|
7
7
|
import itertools
|
|
8
|
-
from typing import Iterable, Iterator, Sequence
|
|
8
|
+
from typing import Iterable, Iterator, Sequence
|
|
9
9
|
|
|
10
10
|
from pyconll.unit.sentence import Sentence
|
|
11
11
|
from pyconll.unit.token import Token
|
|
@@ -15,7 +15,7 @@ def find_ngrams(
|
|
|
15
15
|
conll: Iterable[Sentence],
|
|
16
16
|
ngram: Sequence[str],
|
|
17
17
|
case_sensitive: bool = True
|
|
18
|
-
) -> Iterator[
|
|
18
|
+
) -> Iterator[tuple[Sentence, int, list[Token]]]:
|
|
19
19
|
"""
|
|
20
20
|
Find the occurrences of the ngram in the provided Conll collection.
|
|
21
21
|
|
|
@@ -77,7 +77,7 @@ def find_ngrams(
|
|
|
77
77
|
i += 1
|
|
78
78
|
|
|
79
79
|
|
|
80
|
-
def find_nonprojective_deps(sentence: Sentence) ->
|
|
80
|
+
def find_nonprojective_deps(sentence: Sentence) -> list[tuple[Token, Token]]:
|
|
81
81
|
"""
|
|
82
82
|
Find the nonprojective dependency pairs in the provided sentence.
|
|
83
83
|
|
|
@@ -95,7 +95,7 @@ def find_nonprojective_deps(sentence: Sentence) -> List[Tuple[Token, Token]]:
|
|
|
95
95
|
dependency pair.
|
|
96
96
|
"""
|
|
97
97
|
deps = _transform_tokens_to_sorted_dependency_arcs(sentence)
|
|
98
|
-
non_projective_deps:
|
|
98
|
+
non_projective_deps: list[tuple[int, int]] = []
|
|
99
99
|
|
|
100
100
|
openings = [-1]
|
|
101
101
|
closings = [len(sentence)]
|
|
@@ -169,6 +169,7 @@ class _DependencyComparer:
|
|
|
169
169
|
"""
|
|
170
170
|
Wrapper to compare dependency arcs.
|
|
171
171
|
"""
|
|
172
|
+
|
|
172
173
|
def __init__(self, dep):
|
|
173
174
|
"""
|
|
174
175
|
Creates the wrapper for this dependency.
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyconll
|
|
3
|
+
Version: 3.3.1
|
|
4
|
+
Summary: Read and manipulate CoNLL files
|
|
5
|
+
Home-page: https://github.com/pyconll/pyconll
|
|
6
|
+
Author: Matias Grioni
|
|
7
|
+
Author-email: matgrioni@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Keywords: nlp,conllu,conll,universal dependencies
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Education
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
15
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering
|
|
17
|
+
Classifier: Topic :: Utilities
|
|
18
|
+
Requires-Python: ~=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Dynamic: author
|
|
22
|
+
Dynamic: author-email
|
|
23
|
+
Dynamic: classifier
|
|
24
|
+
Dynamic: description
|
|
25
|
+
Dynamic: description-content-type
|
|
26
|
+
Dynamic: home-page
|
|
27
|
+
Dynamic: keywords
|
|
28
|
+
Dynamic: license
|
|
29
|
+
Dynamic: license-file
|
|
30
|
+
Dynamic: requires-python
|
|
31
|
+
Dynamic: summary
|
|
32
|
+
|
|
33
|
+
[](https://github.com/pyconll/pyconll)
|
|
34
|
+
[](https://coveralls.io/github/pyconll/pyconll?branch=master)
|
|
35
|
+
[](https://pyconll.readthedocs.io/en/stable)
|
|
36
|
+
[](https://github.com/pyconll/pyconll/releases)
|
|
37
|
+
[](https://gitter.im/pyconll/pyconll?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
|
38
|
+
|
|
39
|
+
## pyconll
|
|
40
|
+
|
|
41
|
+
*Easily work with **CoNLL** files using the familiar syntax of **python**.*
|
|
42
|
+
|
|
43
|
+
<img src="res/logo.svg" width="256px" height="256px">
|
|
44
|
+
|
|
45
|
+
##### Links
|
|
46
|
+
- [Homepage](https://pyconll.github.io)
|
|
47
|
+
- [Documentation](https://pyconll.readthedocs.io/)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
### Installation
|
|
51
|
+
|
|
52
|
+
As with most python packages, simply use `pip` to install from PyPi.
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
pip install pyconll
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
`pyconll` is also available as a conda package on the `pyconll` channel. Only packages 2.2.0 and newer are available on conda at the moment.
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
conda install -c pyconll pyconll
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
pyconll supports Python 3.10 or newer. In general, pyconll will focus development efforts on officially supported python versions.
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
### Use
|
|
68
|
+
|
|
69
|
+
This tool is intended to be a **minimal**, **low level**, **expressive** and **pragmatic** library in a widely used programming language. pyconll creates a thin API on top of raw CoNLL annotations that is simple and intuitive.
|
|
70
|
+
|
|
71
|
+
It offers the following features:
|
|
72
|
+
* Regular CI testing and validation against all UD v2.x versions.
|
|
73
|
+
* A strong domain model that includes CoNLL sources, Sentences, Tokens, Trees, etc.
|
|
74
|
+
* A typed API for better development experience and better semantics.
|
|
75
|
+
* A focus on usability and simplicity in design (no dependencies)
|
|
76
|
+
* Performance optimizations for a smooth development workflow no matter the dataset size (performs about 25%-35% faster than other comparable packages)
|
|
77
|
+
|
|
78
|
+
See the following code example to understand the basics of the API.
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
# This snippet finds sentences where a token marked with part of speech 'AUX' is
|
|
82
|
+
# governed by a NOUN. For example, in French this is a less common construction
|
|
83
|
+
# and we may want to validate these examples because we have previously found some
|
|
84
|
+
# problematic examples of this construction.
|
|
85
|
+
import pyconll
|
|
86
|
+
|
|
87
|
+
train = pyconll.load_from_file('./ud/train.conllu')
|
|
88
|
+
|
|
89
|
+
review_sentences = []
|
|
90
|
+
|
|
91
|
+
# Conll objects are iterable over their sentences, and sentences are iterable
|
|
92
|
+
# over their tokens. Sentences also de/serialize comment information.
|
|
93
|
+
for sentence in train:
|
|
94
|
+
for token in sentence:
|
|
95
|
+
|
|
96
|
+
# Tokens have attributes such as upos, head, id, deprel, etc, and sentences
|
|
97
|
+
# can be indexed by a token's id. We must check that the token is not the
|
|
98
|
+
# root token, whose id, '0', cannot be looked up.
|
|
99
|
+
if token.upos == 'AUX' and (token.head != '0' and sentence[token.head].upos == 'NOUN'):
|
|
100
|
+
review_sentences.append(sentence)
|
|
101
|
+
|
|
102
|
+
print('Review the following sentences:')
|
|
103
|
+
for sent in review_sentences:
|
|
104
|
+
print(sent.id)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
A full definition of the API can be found in the [documentation](https://pyconll.readthedocs.io/) or use the [quick start](https://pyconll.readthedocs.io/en/stable/starting.html) guide for a focused introduction.
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
### Uses and Limitations
|
|
111
|
+
|
|
112
|
+
This package edits CoNLL-U annotations. This does not include the annotated text itself. Word forms on Tokens are not editable and Sentence Tokens cannot be reassigned or reordered. `pyconll` focuses on editing CoNLL-U annotation rather than creating it or changing the underlying text that is annotated. If there is interest in this functionality area, please create a GitHub issue for more visibility.
|
|
113
|
+
|
|
114
|
+
This package is also only validated against the CoNLL-U format. The CoNLL and CoNLL-X formats are not supported, although they are very similar. I originally intended to support these formats as well, but they are not as well defined as CoNLL-U, so they are not included. Please create an issue for visibility if this feature interests you.
|
|
115
|
+
|
|
116
|
+
Lastly, linguistic data can often be very large and this package attempts to keep that in mind. pyconll provides methods for creating in memory conll objects along with an iterate only version in case a corpus is too large to store in memory (the size of the memory structure is several times larger than the actual corpus file). The iterate only version can parse upwards of 100,000 words per second on a 16gb ram machine, so for most datasets to be used on a local dev machine, this package will perform well. The 2.2.0 release also improves parse time and memory footprint by about 25%!
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
### Contributing
|
|
120
|
+
|
|
121
|
+
Contributions to this project are welcome and encouraged! If you are unsure how to contribute, here is a [guide](https://help.github.com/en/articles/creating-a-pull-request-from-a-fork) from GitHub explaining the basic workflow. After cloning this repo, please run `pip install -r requirements.txt` to properly set up your local environment. Some of the installed tools, like yapf, pylint, and mypy, do not have to be run locally, but CI builds will fail unless they run successfully. Some other release dependencies like twine and sphinx are also installed.
|
|
122
|
+
|
|
123
|
+
For packaging new versions, use setuptools version 24.2.0 or greater for creating the appropriate packaging that recognizes the `python_requires` metadata. Final packaging and release is now done with Github actions so this is less of a concern.
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
#### README and CHANGELOG
|
|
127
|
+
|
|
128
|
+
When changing either of these files, please change the Markdown version and run ``make gendocs`` so that the other versions stay in sync.
|
|
129
|
+
|
|
130
|
+
#### Release Checklist
|
|
131
|
+
|
|
132
|
+
The general release process is enumerated explicitly below. This section is for internal use and most people do not have to worry about it. First, note that the dev branch is always a direct extension of master with the latest changes since the last release. That is, it is essentially a staging release branch.
|
|
133
|
+
|
|
134
|
+
* Change the version in `pyconll/_version.py` appropriately.
|
|
135
|
+
* Merge dev into master **locally**. Github does not offer a fast forward merge and explicitly uses --no-ff. So to keep the linear nature of changes, merge locally to fast forward. This is assuming that the dev branch looks good on CI tests which do not automatically run in this situation.
|
|
136
|
+
* Push the master branch. This should start some CI tests specifically for master. After validating these results, create a tag corresponding to the next version number and push the tag.
|
|
137
|
+
* Create a new release from this tag from the [Releases page](https://github.com/pyconll/pyconll/releases). On creating this release, two workflows will start. One releases to pypi, and the other releases to conda.
|
|
138
|
+
* Validate these workflows pass, and the package is properly released on both platforms.
|
|
@@ -1,8 +1,6 @@
|
|
|
1
1
|
LICENSE
|
|
2
2
|
MANIFEST.in
|
|
3
|
-
README
|
|
4
3
|
README.md
|
|
5
|
-
README.rst
|
|
6
4
|
setup.py
|
|
7
5
|
pyconll/__init__.py
|
|
8
6
|
pyconll/_parser.py
|
|
@@ -22,4 +20,7 @@ pyconll/tree/tree.py
|
|
|
22
20
|
pyconll/unit/__init__.py
|
|
23
21
|
pyconll/unit/conll.py
|
|
24
22
|
pyconll/unit/sentence.py
|
|
25
|
-
pyconll/unit/token.py
|
|
23
|
+
pyconll/unit/token.py
|
|
24
|
+
tests/test_conllable.py
|
|
25
|
+
tests/test_load.py
|
|
26
|
+
tests/test_util.py
|
|
@@ -22,20 +22,20 @@ setup(
|
|
|
22
22
|
packages = ['pyconll', 'pyconll.unit', 'pyconll.tree'],
|
|
23
23
|
version = parse.package_version(make_relative('pyconll/_version.py')),
|
|
24
24
|
description = 'Read and manipulate CoNLL files',
|
|
25
|
-
long_description = make_relative('README.
|
|
25
|
+
long_description = make_relative('README.md').read_text(),
|
|
26
|
+
long_description_content_type="text/markdown",
|
|
26
27
|
author = 'Matias Grioni',
|
|
27
28
|
author_email = 'matgrioni@gmail.com',
|
|
28
29
|
url = 'https://github.com/pyconll/pyconll',
|
|
29
30
|
license = 'MIT',
|
|
30
31
|
keywords = ['nlp', 'conllu', 'conll', 'universal dependencies'],
|
|
31
|
-
python_requires = '~=3.
|
|
32
|
+
python_requires = '~=3.10',
|
|
32
33
|
package_data = { 'pyconll': ['py.typed'] },
|
|
33
34
|
classifiers = [
|
|
34
35
|
'Development Status :: 5 - Production/Stable',
|
|
35
36
|
'Intended Audience :: Developers',
|
|
36
37
|
'Intended Audience :: Education',
|
|
37
38
|
'Intended Audience :: Science/Research',
|
|
38
|
-
'License :: OSI Approved :: MIT License',
|
|
39
39
|
'Programming Language :: Python :: 3 :: Only',
|
|
40
40
|
'Programming Language :: Python :: Implementation :: CPython',
|
|
41
41
|
'Topic :: Scientific/Engineering',
|