IUExtract 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- iuextract-1.0.0/LICENSE +33 -0
- iuextract-1.0.0/MANIFEST.in +1 -0
- iuextract-1.0.0/PKG-INFO +86 -0
- iuextract-1.0.0/README.md +33 -0
- iuextract-1.0.0/pyproject.toml +35 -0
- iuextract-1.0.0/setup.cfg +4 -0
- iuextract-1.0.0/setup.py +32 -0
- iuextract-1.0.0/src/IUExtract.egg-info/PKG-INFO +86 -0
- iuextract-1.0.0/src/IUExtract.egg-info/SOURCES.txt +23 -0
- iuextract-1.0.0/src/IUExtract.egg-info/dependency_links.txt +1 -0
- iuextract-1.0.0/src/IUExtract.egg-info/entry_points.txt +2 -0
- iuextract-1.0.0/src/IUExtract.egg-info/requires.txt +1 -0
- iuextract-1.0.0/src/IUExtract.egg-info/top_level.txt +1 -0
- iuextract-1.0.0/src/iuextract/__init__.py +0 -0
- iuextract-1.0.0/src/iuextract/__main__.py +42 -0
- iuextract-1.0.0/src/iuextract/data.py +325 -0
- iuextract-1.0.0/src/iuextract/extract.py +585 -0
- iuextract-1.0.0/src/iuextract/gold.py +123 -0
- iuextract-1.0.0/src/iuextract/iu_utils.py +245 -0
- iuextract-1.0.0/src/iuextract/transition_signals.txt +49 -0
iuextract-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
The Clear BSD License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022 Marcello Gecchele,
|
|
4
|
+
Tokunaga Laboratory of Computational Linguistics, Tokyo Institute of Technology
|
|
5
|
+
All rights reserved.
|
|
6
|
+
|
|
7
|
+
Redistribution and use in source and binary forms, with or without
|
|
8
|
+
modification, are permitted (subject to the limitations in the disclaimer
|
|
9
|
+
below) provided that the following conditions are met:
|
|
10
|
+
|
|
11
|
+
* Redistributions of source code must retain the above copyright notice,
|
|
12
|
+
this list of conditions and the following disclaimer.
|
|
13
|
+
|
|
14
|
+
* Redistributions in binary form must reproduce the above copyright
|
|
15
|
+
notice, this list of conditions and the following disclaimer in the
|
|
16
|
+
documentation and/or other materials provided with the distribution.
|
|
17
|
+
|
|
18
|
+
* Neither the name of the copyright holder nor the names of its
|
|
19
|
+
contributors may be used to endorse or promote products derived from this
|
|
20
|
+
software without specific prior written permission.
|
|
21
|
+
|
|
22
|
+
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
|
|
23
|
+
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
|
24
|
+
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
25
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
26
|
+
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
|
27
|
+
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
28
|
+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
29
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
|
30
|
+
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
|
|
31
|
+
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
32
|
+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
33
|
+
POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
include src/iuextract/transition_signals.txt
|
iuextract-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: IUExtract
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Rule-based Idea Unit segmentation algorithm for the English language.
|
|
5
|
+
Home-page: https://github.com/TT-CL/iuextract
|
|
6
|
+
Author: Gecchele Marcello
|
|
7
|
+
Author-email: Marcello Gecchele <linked.uno@pm.me>
|
|
8
|
+
License: The Clear BSD License
|
|
9
|
+
|
|
10
|
+
Copyright (c) 2022 Marcello Gecchele,
|
|
11
|
+
Tokunaga Laboratory of Computational Linguistics, Tokyo Institute of Technology
|
|
12
|
+
All rights reserved.
|
|
13
|
+
|
|
14
|
+
Redistribution and use in source and binary forms, with or without
|
|
15
|
+
modification, are permitted (subject to the limitations in the disclaimer
|
|
16
|
+
below) provided that the following conditions are met:
|
|
17
|
+
|
|
18
|
+
* Redistributions of source code must retain the above copyright notice,
|
|
19
|
+
this list of conditions and the following disclaimer.
|
|
20
|
+
|
|
21
|
+
* Redistributions in binary form must reproduce the above copyright
|
|
22
|
+
notice, this list of conditions and the following disclaimer in the
|
|
23
|
+
documentation and/or other materials provided with the distribution.
|
|
24
|
+
|
|
25
|
+
* Neither the name of the copyright holder nor the names of its
|
|
26
|
+
contributors may be used to endorse or promote products derived from this
|
|
27
|
+
software without specific prior written permission.
|
|
28
|
+
|
|
29
|
+
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
|
|
30
|
+
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
|
31
|
+
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
32
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
33
|
+
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
|
34
|
+
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
35
|
+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
36
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
|
37
|
+
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
|
|
38
|
+
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
39
|
+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
40
|
+
POSSIBILITY OF SUCH DAMAGE.
|
|
41
|
+
Project-URL: Homepage, https://tt-cl.github.io/iu-resources/
|
|
42
|
+
Project-URL: Documentation, https://github.com/TT-CL/iuextract
|
|
43
|
+
Project-URL: Repository, https://github.com/TT-CL/iuextract.git
|
|
44
|
+
Project-URL: Issues, https://github.com/TT-CL/iuextract/issues
|
|
45
|
+
Keywords: Idea Unit,textual segmentation,segmentation,linguistics
|
|
46
|
+
Classifier: Programming Language :: Python :: 3
|
|
47
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
48
|
+
Classifier: Operating System :: OS Independent
|
|
49
|
+
Requires-Python: >=3.8
|
|
50
|
+
Description-Content-Type: text/markdown
|
|
51
|
+
License-File: LICENSE
|
|
52
|
+
Requires-Dist: spacy
|
|
53
|
+
|
|
54
|
+
# IUExtract
|
|
55
|
+
Rule-based Idea Unit segmentation algorithm for the English language.
|
|
56
|
+
|
|
57
|
+
## Installation
|
|
58
|
+
First of all, you need to install the dependencies:
|
|
59
|
+
```
|
|
60
|
+
pip install spacy
|
|
61
|
+
python -m spacy download en_core_web_lg
|
|
62
|
+
```
|
|
63
|
+
To install the package with the command line tool [install pipx](https://pipx.pypa.io/latest/installation/) and run the following command:
|
|
64
|
+
```
|
|
65
|
+
pipx install iuextract
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
If you only wish to use the package in your python projects you can install without executable via
|
|
69
|
+
```
|
|
70
|
+
pip install iuextract
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Command Line Interface (CLI) Usage
|
|
74
|
+
Once installed via `pipx`, you can run
|
|
75
|
+
```
|
|
76
|
+
iuextract -i input_file.txt
|
|
77
|
+
```
|
|
78
|
+
to segment `input_file.txt` into Idea Units. The segmented text will be printed to the console on standard output.
|
|
79
|
+
You can specify an output file with the `-o` parameter.
|
|
80
|
+
```
|
|
81
|
+
iuextract -i input_file.txt -o output_file.txt
|
|
82
|
+
```
|
|
83
|
+
For more options you can call the help argument.
|
|
84
|
+
```
|
|
85
|
+
iuextract -h
|
|
86
|
+
```
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# IUExtract
|
|
2
|
+
Rule-based Idea Unit segmentation algorithm for the English language.
|
|
3
|
+
|
|
4
|
+
## Installation
|
|
5
|
+
First of all, you need to install the dependencies:
|
|
6
|
+
```
|
|
7
|
+
pip install spacy
|
|
8
|
+
python -m spacy download en_core_web_lg
|
|
9
|
+
```
|
|
10
|
+
To install the package together with the command line tool, [install pipx](https://pipx.pypa.io/latest/installation/) and run the following command:
|
|
11
|
+
```
|
|
12
|
+
pipx install iuextract
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
If you only wish to use the package in your Python projects, you can install it without the executable via:
|
|
16
|
+
```
|
|
17
|
+
pip install iuextract
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Command Line Interface (CLI) Usage
|
|
21
|
+
Once installed via `pipx`, you can run
|
|
22
|
+
```
|
|
23
|
+
iuextract -i input_file.txt
|
|
24
|
+
```
|
|
25
|
+
to segment `input_file.txt` into Idea Units. The segmented text will be printed to the console on standard output.
|
|
26
|
+
You can specify an output file with the `-o` parameter.
|
|
27
|
+
```
|
|
28
|
+
iuextract -i input_file.txt -o output_file.txt
|
|
29
|
+
```
|
|
30
|
+
For more options you can call the help argument.
|
|
31
|
+
```
|
|
32
|
+
iuextract -h
|
|
33
|
+
```
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=42"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "IUExtract"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Rule-based Idea Unit segmentation algorithm for the English language."
|
|
9
|
+
readme = {file = "README.md", content-type = "text/markdown"}
|
|
10
|
+
license = {file = "LICENSE"}
|
|
11
|
+
keywords = ["Idea Unit", "textual segmentation", "segmentation", "linguistics"]
|
|
12
|
+
|
|
13
|
+
requires-python = '>=3.8'
|
|
14
|
+
|
|
15
|
+
dependencies = [
|
|
16
|
+
"spacy",
|
|
17
|
+
]
|
|
18
|
+
authors = [
|
|
19
|
+
{name = "Marcello Gecchele", email = "linked.uno@pm.me"},
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
classifiers = [
|
|
23
|
+
'Programming Language :: Python :: 3',
|
|
24
|
+
'License :: OSI Approved :: MIT License',
|
|
25
|
+
'Operating System :: OS Independent'
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
iuextract = "iuextract:__main__"
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Homepage = "https://tt-cl.github.io/iu-resources/"
|
|
33
|
+
Documentation = "https://github.com/TT-CL/iuextract"
|
|
34
|
+
Repository = "https://github.com/TT-CL/iuextract.git"
|
|
35
|
+
Issues = "https://github.com/TT-CL/iuextract/issues"
|
iuextract-1.0.0/setup.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import setuptools
|
|
2
|
+
|
|
3
|
+
# Legacy setuptools build script for the iuextract distribution.
# The README is reused verbatim as the long description shown on the registry.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="iuextract",
    # Kept in step with the release version declared in pyproject.toml
    # (the previous value, 0.0.10, was stale).
    version="1.0.0",
    author="Gecchele Marcello",
    author_email="git@gecchele.dev",
    description="Extract Idea Units from strings and files",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/TT-CL/iuextract",
    project_urls={
        "Bug Tracker": "https://github.com/TT-CL/iuextract/issues",
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        # NOTE(review): the bundled LICENSE file is The Clear BSD License,
        # not MIT -- confirm which license classifier is intended.
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    # Sources live under src/ (src-layout), so package discovery starts there.
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
    # Ship the non-Python data files listed in MANIFEST.in
    # (src/iuextract/transition_signals.txt).
    include_package_data=True,
    python_requires=">=3.8",
    install_requires=[
        'setuptools',
        'spacy>=3.0.0',
    ],
)
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: IUExtract
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Rule-based Idea Unit segmentation algorithm for the English language.
|
|
5
|
+
Home-page: https://github.com/TT-CL/iuextract
|
|
6
|
+
Author: Gecchele Marcello
|
|
7
|
+
Author-email: Marcello Gecchele <linked.uno@pm.me>
|
|
8
|
+
License: The Clear BSD License
|
|
9
|
+
|
|
10
|
+
Copyright (c) 2022 Marcello Gecchele,
|
|
11
|
+
Tokunaga Laboratory of Computational Linguistics, Tokyo Institute of Technology
|
|
12
|
+
All rights reserved.
|
|
13
|
+
|
|
14
|
+
Redistribution and use in source and binary forms, with or without
|
|
15
|
+
modification, are permitted (subject to the limitations in the disclaimer
|
|
16
|
+
below) provided that the following conditions are met:
|
|
17
|
+
|
|
18
|
+
* Redistributions of source code must retain the above copyright notice,
|
|
19
|
+
this list of conditions and the following disclaimer.
|
|
20
|
+
|
|
21
|
+
* Redistributions in binary form must reproduce the above copyright
|
|
22
|
+
notice, this list of conditions and the following disclaimer in the
|
|
23
|
+
documentation and/or other materials provided with the distribution.
|
|
24
|
+
|
|
25
|
+
* Neither the name of the copyright holder nor the names of its
|
|
26
|
+
contributors may be used to endorse or promote products derived from this
|
|
27
|
+
software without specific prior written permission.
|
|
28
|
+
|
|
29
|
+
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
|
|
30
|
+
THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
|
31
|
+
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
32
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
33
|
+
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
|
34
|
+
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
35
|
+
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
36
|
+
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
|
37
|
+
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
|
|
38
|
+
IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
39
|
+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
40
|
+
POSSIBILITY OF SUCH DAMAGE.
|
|
41
|
+
Project-URL: Homepage, https://tt-cl.github.io/iu-resources/
|
|
42
|
+
Project-URL: Documentation, https://github.com/TT-CL/iuextract
|
|
43
|
+
Project-URL: Repository, https://github.com/TT-CL/iuextract.git
|
|
44
|
+
Project-URL: Issues, https://github.com/TT-CL/iuextract/issues
|
|
45
|
+
Keywords: Idea Unit,textual segmentation,segmentation,linguistics
|
|
46
|
+
Classifier: Programming Language :: Python :: 3
|
|
47
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
48
|
+
Classifier: Operating System :: OS Independent
|
|
49
|
+
Requires-Python: >=3.8
|
|
50
|
+
Description-Content-Type: text/markdown
|
|
51
|
+
License-File: LICENSE
|
|
52
|
+
Requires-Dist: spacy
|
|
53
|
+
|
|
54
|
+
# IUExtract
|
|
55
|
+
Rule-based Idea Unit segmentation algorithm for the English language.
|
|
56
|
+
|
|
57
|
+
## Installation
|
|
58
|
+
First of all, you need to install the dependencies:
|
|
59
|
+
```
|
|
60
|
+
pip install spacy
|
|
61
|
+
python -m spacy download en_core_web_lg
|
|
62
|
+
```
|
|
63
|
+
To install the package with the command line tool [install pipx](https://pipx.pypa.io/latest/installation/) and run the following command:
|
|
64
|
+
```
|
|
65
|
+
pipx install iuextract
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
If you only wish to use the package in your python projects you can install without executable via
|
|
69
|
+
```
|
|
70
|
+
pip install iuextract
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Command Line Interface (CLI) Usage
|
|
74
|
+
Once installed via `pipx`, you can run
|
|
75
|
+
```
|
|
76
|
+
iuextract -i input_file.txt
|
|
77
|
+
```
|
|
78
|
+
to segment `input_file.txt` into Idea Units. The segmented text will be printed to the console on standard output.
|
|
79
|
+
You can specify an output file with the `-o` parameter.
|
|
80
|
+
```
|
|
81
|
+
iuextract -i input_file.txt -o output_file.txt
|
|
82
|
+
```
|
|
83
|
+
For more options you can call the help argument.
|
|
84
|
+
```
|
|
85
|
+
iuextract -h
|
|
86
|
+
```
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
setup.py
|
|
6
|
+
src/IUExtract.egg-info/PKG-INFO
|
|
7
|
+
src/IUExtract.egg-info/SOURCES.txt
|
|
8
|
+
src/IUExtract.egg-info/dependency_links.txt
|
|
9
|
+
src/IUExtract.egg-info/entry_points.txt
|
|
10
|
+
src/IUExtract.egg-info/requires.txt
|
|
11
|
+
src/IUExtract.egg-info/top_level.txt
|
|
12
|
+
src/iuextract/__init__.py
|
|
13
|
+
src/iuextract/__main__.py
|
|
14
|
+
src/iuextract/data.py
|
|
15
|
+
src/iuextract/extract.py
|
|
16
|
+
src/iuextract/gold.py
|
|
17
|
+
src/iuextract/iu_utils.py
|
|
18
|
+
src/iuextract/transition_signals.txt
|
|
19
|
+
src/iuextract.egg-info/PKG-INFO
|
|
20
|
+
src/iuextract.egg-info/SOURCES.txt
|
|
21
|
+
src/iuextract.egg-info/dependency_links.txt
|
|
22
|
+
src/iuextract.egg-info/requires.txt
|
|
23
|
+
src/iuextract.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
spacy
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
iuextract
|
|
File without changes
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
import warnings
|
|
4
|
+
from .data import import_file
|
|
5
|
+
from .extract import label_ius
|
|
6
|
+
from .iu_utils import iuDoc2str
|
|
7
|
+
import spacy
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Command-line interface for iuextract.
#
# Reads raw text either from the file given with -i/--input or from the
# positional arguments, segments it into Idea Units, and writes the labelled
# segmentation to -o/--output (standard output by default).
#
# The code intentionally runs at module level: it is executed as soon as the
# module is run or imported.
parser = argparse.ArgumentParser(prog='iuextract', description='Segment a raw text into Idea Units')
parser.add_argument('-i', '--input', help='the {i}nput text. If it is not provided, the program will read the positional arguments input as a text.', type=argparse.FileType('r'), required=False, default=None)
# nargs='?' allows a bare `-o` with no filename; const=sys.stdout makes that
# case fall back to the terminal instead of storing None and crashing on write.
parser.add_argument('-o', '--output', help="the {o}utput file where to store the ius. By default the segmentation will be shown on the terminal and will not be stored on disk.", nargs='?', type=argparse.FileType('w'), const=sys.stdout, default=sys.stdout)
parser.add_argument('pos_input', help="the input text to analyze if no filename is provided.", nargs='*')
parser.add_argument('-b', '--before', help="a sequence of text to place {b}efore each IU. By default, no prefix is set.", required=False, default="")
parser.add_argument('-s', '--separator', help="a sequence of text to {s}eparate the index from the IU. By default, the separator is the character | .", required=False, default="|")
parser.add_argument('-a', '--after', help="a sequence of text to place {a}fter each IU. By default, the suffix is the newline character.", required=False, default="\n")

args = parser.parse_args()
input_file = args.input
# Positional words are re-joined into a single text; an empty list means
# no inline text was given.
text_input = ' '.join(args.pos_input) if args.pos_input else None

if input_file is None and text_input is None:
    # Report the problem as a usage error (usage line + exit status 2)
    # rather than an uncaught exception traceback.
    parser.error("Please provide a valid input text. Run iuextract -h for more help.")
if input_file is not None and text_input is not None:
    input_warning = "WARNING: the program detected both an input file passed via the -i argument and input text via positional arguments. Ignoring the positional arguments and only processing the input file."
    warnings.warn(input_warning)

# The input file, when present, takes precedence over the positional text.
raw_input = text_input
if input_file is not None:
    try:
        raw_input = input_file.read()
    finally:
        # argparse.FileType maps "-" to sys.stdin, which is not ours to close.
        if input_file is not sys.stdin:
            input_file.close()

outputFile = args.output
prefix = args.before
suffix = args.after
separator = args.separator

nlp = spacy.load("en_core_web_lg")
parsed = import_file(raw_input, nlp=nlp)
label_ius(parsed)
raw_output = iuDoc2str(parsed, gold=False, index_sep=separator, opener=prefix, closer=suffix)
# Always terminate the output with a newline, whatever suffix was requested.
if not raw_output.endswith('\n'):
    raw_output = f'{raw_output}\n'
try:
    outputFile.write(raw_output)
finally:
    # Close files opened by argparse, but leave the terminal stream open.
    if outputFile is not sys.stdout:
        outputFile.close()
|