IUExtract 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ The Clear BSD License
2
+
3
+ Copyright (c) 2022 Marcello Gecchele,
4
+ Tokunaga Laboratory of Computational Linguistics, Tokyo Institute of Technology
5
+ All rights reserved.
6
+
7
+ Redistribution and use in source and binary forms, with or without
8
+ modification, are permitted (subject to the limitations in the disclaimer
9
+ below) provided that the following conditions are met:
10
+
11
+ * Redistributions of source code must retain the above copyright notice,
12
+ this list of conditions and the following disclaimer.
13
+
14
+ * Redistributions in binary form must reproduce the above copyright
15
+ notice, this list of conditions and the following disclaimer in the
16
+ documentation and/or other materials provided with the distribution.
17
+
18
+ * Neither the name of the copyright holder nor the names of its
19
+ contributors may be used to endorse or promote products derived from this
20
+ software without specific prior written permission.
21
+
22
+ NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
23
+ THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
24
+ CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
26
+ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
27
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
30
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
31
+ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33
+ POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1 @@
1
+ include src/iuextract/transition_signals.txt
@@ -0,0 +1,86 @@
1
+ Metadata-Version: 2.1
2
+ Name: IUExtract
3
+ Version: 1.0.0
4
+ Summary: Rule-based Idea Unit segmentation algorithm for the English language.
5
+ Home-page: https://github.com/TT-CL/iuextract
6
+ Author: Gecchele Marcello
7
+ Author-email: Marcello Gecchele <linked.uno@pm.me>
8
+ License: The Clear BSD License
9
+
10
+ Copyright (c) 2022 Marcello Gecchele,
11
+ Tokunaga Laboratory of Computational Linguistics, Tokyo Institute of Technology
12
+ All rights reserved.
13
+
14
+ Redistribution and use in source and binary forms, with or without
15
+ modification, are permitted (subject to the limitations in the disclaimer
16
+ below) provided that the following conditions are met:
17
+
18
+ * Redistributions of source code must retain the above copyright notice,
19
+ this list of conditions and the following disclaimer.
20
+
21
+ * Redistributions in binary form must reproduce the above copyright
22
+ notice, this list of conditions and the following disclaimer in the
23
+ documentation and/or other materials provided with the distribution.
24
+
25
+ * Neither the name of the copyright holder nor the names of its
26
+ contributors may be used to endorse or promote products derived from this
27
+ software without specific prior written permission.
28
+
29
+ NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
30
+ THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31
+ CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
32
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
33
+ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
34
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
35
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
36
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
37
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
38
+ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
39
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
40
+ POSSIBILITY OF SUCH DAMAGE.
41
+ Project-URL: Homepage, https://tt-cl.github.io/iu-resources/
42
+ Project-URL: Documentation, https://github.com/TT-CL/iuextract
43
+ Project-URL: Repository, https://github.com/TT-CL/iuextract.git
44
+ Project-URL: Issues, https://github.com/TT-CL/iuextract/issues
45
+ Keywords: Idea Unit,textual segmentation,segmentation,linguistics
46
+ Classifier: Programming Language :: Python :: 3
47
+ Classifier: License :: OSI Approved :: MIT License
48
+ Classifier: Operating System :: OS Independent
49
+ Requires-Python: >=3.8
50
+ Description-Content-Type: text/markdown
51
+ License-File: LICENSE
52
+ Requires-Dist: spacy
53
+
54
+ # IUExtract
55
+ Rule-based Idea Unit segmentation algorithm for the English language.
56
+
57
+ ## Installation
58
+ First of all, you need to install the dependencies:
59
+ ```
60
+ pip install spacy
61
+ python -m spacy download en_core_web_lg
62
+ ```
63
+ To install the package together with its command line tool, [install pipx](https://pipx.pypa.io/latest/installation/) and run the following command:
64
+ ```
65
+ pipx install iuextract
66
+ ```
67
+
68
+ If you only wish to use the package in your Python projects, you can install it without the executable via
69
+ ```
70
+ pip install iuextract
71
+ ```
72
+
73
+ ## Command Line Interface (CLI) Usage
74
+ Once installed via `pipx`, you can run
75
+ ```
76
+ iuextract -i input_file.txt
77
+ ```
78
+ to segment `input_file.txt` into Idea Units. The segmented text will be printed to the console on standard output.
79
+ You can specify an output file with the `-o` parameter.
80
+ ```
81
+ iuextract -i input_file.txt -o output_file.txt
82
+ ```
83
+ For more options you can call the help argument.
84
+ ```
85
+ iuextract -h
86
+ ```
@@ -0,0 +1,33 @@
1
+ # IUExtract
2
+ Rule-based Idea Unit segmentation algorithm for the English language.
3
+
4
+ ## Installation
5
+ First of all, you need to install the dependencies:
6
+ ```
7
+ pip install spacy
8
+ python -m spacy download en_core_web_lg
9
+ ```
10
+ To install the package together with its command line tool, [install pipx](https://pipx.pypa.io/latest/installation/) and run the following command:
11
+ ```
12
+ pipx install iuextract
13
+ ```
14
+
15
+ If you only wish to use the package in your Python projects, you can install it without the executable via
16
+ ```
17
+ pip install iuextract
18
+ ```
19
+
20
+ ## Command Line Interface (CLI) Usage
21
+ Once installed via `pipx`, you can run
22
+ ```
23
+ iuextract -i input_file.txt
24
+ ```
25
+ to segment `input_file.txt` into Idea Units. The segmented text will be printed to the console on standard output.
26
+ You can specify an output file with the `-o` parameter.
27
+ ```
28
+ iuextract -i input_file.txt -o output_file.txt
29
+ ```
30
+ For more options you can call the help argument.
31
+ ```
32
+ iuextract -h
33
+ ```
@@ -0,0 +1,35 @@
1
+ [build-system]
2
+ requires = ["setuptools>=42"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "IUExtract"
7
+ version = "1.0.0"
8
+ description = "Rule-based Idea Unit segmentation algorithm for the English language."
9
+ readme = {file = "README.md", content-type = "text/markdown"}
10
+ license = {file = "LICENSE"}
11
+ keywords = ["Idea Unit", "textual segmentation", "segmentation", "linguistics"]
12
+
13
+ requires-python = '>=3.8'
14
+
15
+ dependencies = [
16
+ "spacy",
17
+ ]
18
+ authors = [
19
+ {name = "Marcello Gecchele", email = "linked.uno@pm.me"},
20
+ ]
21
+
22
+ classifiers = [
23
+ 'Programming Language :: Python :: 3',
24
+ 'License :: OSI Approved :: MIT License',
25
+ 'Operating System :: OS Independent'
26
+ ]
27
+
28
+ [project.scripts]
29
+ iuextract = "iuextract:__main__"
30
+
31
+ [project.urls]
32
+ Homepage = "https://tt-cl.github.io/iu-resources/"
33
+ Documentation = "https://github.com/TT-CL/iuextract"
34
+ Repository = "https://github.com/TT-CL/iuextract.git"
35
+ Issues = "https://github.com/TT-CL/iuextract/issues"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,32 @@
1
+ import setuptools
2
+
# Legacy setuptools script shipped alongside pyproject.toml.
# NOTE(review): pyproject.toml's [project] table also declares the name,
# version, description, dependencies and classifiers, and the built PKG-INFO
# carries those values (Version 1.0.0, Requires-Dist: spacy), so the duplicated
# fields below appear to be overridden at build time — confirm before relying
# on any of them.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="iuextract",
    # Kept in sync with ``version`` in pyproject.toml (previously the stale
    # "0.0.10").
    version="1.0.0",
    author="Gecchele Marcello",
    author_email="git@gecchele.dev",
    description="Extract Idea Units from strings and files",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/TT-CL/iuextract",

    project_urls={
        "Bug Tracker": "https://github.com/TT-CL/iuextract/issues",
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        # NOTE(review): the bundled LICENSE file is The Clear BSD License, not
        # MIT — confirm which classifier is intended.
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    # Packages live under src/ (src-layout).
    package_dir={"": "src"},
    packages=setuptools.find_packages(where="src"),
    # Ships the data files listed in MANIFEST.in (transition_signals.txt).
    include_package_data=True,
    python_requires=">=3.8",
    install_requires=[
        'setuptools',
        'spacy>=3.0.0',
    ],
)
@@ -0,0 +1,86 @@
1
+ Metadata-Version: 2.1
2
+ Name: IUExtract
3
+ Version: 1.0.0
4
+ Summary: Rule-based Idea Unit segmentation algorithm for the English language.
5
+ Home-page: https://github.com/TT-CL/iuextract
6
+ Author: Gecchele Marcello
7
+ Author-email: Marcello Gecchele <linked.uno@pm.me>
8
+ License: The Clear BSD License
9
+
10
+ Copyright (c) 2022 Marcello Gecchele,
11
+ Tokunaga Laboratory of Computational Linguistics, Tokyo Institute of Technology
12
+ All rights reserved.
13
+
14
+ Redistribution and use in source and binary forms, with or without
15
+ modification, are permitted (subject to the limitations in the disclaimer
16
+ below) provided that the following conditions are met:
17
+
18
+ * Redistributions of source code must retain the above copyright notice,
19
+ this list of conditions and the following disclaimer.
20
+
21
+ * Redistributions in binary form must reproduce the above copyright
22
+ notice, this list of conditions and the following disclaimer in the
23
+ documentation and/or other materials provided with the distribution.
24
+
25
+ * Neither the name of the copyright holder nor the names of its
26
+ contributors may be used to endorse or promote products derived from this
27
+ software without specific prior written permission.
28
+
29
+ NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED BY
30
+ THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31
+ CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
32
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
33
+ PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
34
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
35
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
36
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
37
+ BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
38
+ IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
39
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
40
+ POSSIBILITY OF SUCH DAMAGE.
41
+ Project-URL: Homepage, https://tt-cl.github.io/iu-resources/
42
+ Project-URL: Documentation, https://github.com/TT-CL/iuextract
43
+ Project-URL: Repository, https://github.com/TT-CL/iuextract.git
44
+ Project-URL: Issues, https://github.com/TT-CL/iuextract/issues
45
+ Keywords: Idea Unit,textual segmentation,segmentation,linguistics
46
+ Classifier: Programming Language :: Python :: 3
47
+ Classifier: License :: OSI Approved :: MIT License
48
+ Classifier: Operating System :: OS Independent
49
+ Requires-Python: >=3.8
50
+ Description-Content-Type: text/markdown
51
+ License-File: LICENSE
52
+ Requires-Dist: spacy
53
+
54
+ # IUExtract
55
+ Rule-based Idea Unit segmentation algorithm for the English language.
56
+
57
+ ## Installation
58
+ First of all, you need to install the dependencies:
59
+ ```
60
+ pip install spacy
61
+ python -m spacy download en_core_web_lg
62
+ ```
63
+ To install the package together with its command line tool, [install pipx](https://pipx.pypa.io/latest/installation/) and run the following command:
64
+ ```
65
+ pipx install iuextract
66
+ ```
67
+
68
+ If you only wish to use the package in your Python projects, you can install it without the executable via
69
+ ```
70
+ pip install iuextract
71
+ ```
72
+
73
+ ## Command Line Interface (CLI) Usage
74
+ Once installed via `pipx`, you can run
75
+ ```
76
+ iuextract -i input_file.txt
77
+ ```
78
+ to segment `input_file.txt` into Idea Units. The segmented text will be printed to the console on standard output.
79
+ You can specify an output file with the `-o` parameter.
80
+ ```
81
+ iuextract -i input_file.txt -o output_file.txt
82
+ ```
83
+ For more options you can call the help argument.
84
+ ```
85
+ iuextract -h
86
+ ```
@@ -0,0 +1,23 @@
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ pyproject.toml
5
+ setup.py
6
+ src/IUExtract.egg-info/PKG-INFO
7
+ src/IUExtract.egg-info/SOURCES.txt
8
+ src/IUExtract.egg-info/dependency_links.txt
9
+ src/IUExtract.egg-info/entry_points.txt
10
+ src/IUExtract.egg-info/requires.txt
11
+ src/IUExtract.egg-info/top_level.txt
12
+ src/iuextract/__init__.py
13
+ src/iuextract/__main__.py
14
+ src/iuextract/data.py
15
+ src/iuextract/extract.py
16
+ src/iuextract/gold.py
17
+ src/iuextract/iu_utils.py
18
+ src/iuextract/transition_signals.txt
19
+ src/iuextract.egg-info/PKG-INFO
20
+ src/iuextract.egg-info/SOURCES.txt
21
+ src/iuextract.egg-info/dependency_links.txt
22
+ src/iuextract.egg-info/requires.txt
23
+ src/iuextract.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ iuextract = iuextract:__main__
@@ -0,0 +1 @@
1
+ spacy
@@ -0,0 +1 @@
1
+ iuextract
File without changes
@@ -0,0 +1,42 @@
1
+ import argparse
2
+ import sys
3
+ import warnings
4
+ from .data import import_file
5
+ from .extract import label_ius
6
+ from .iu_utils import iuDoc2str
7
+ import spacy
8
+
9
+
def _build_parser():
    """Create the argument parser for the ``iuextract`` command line tool."""
    parser = argparse.ArgumentParser(prog='iuextract', description='Segment a raw text into Idea Units')
    parser.add_argument('-i', '--input', help='the {i}nput text. If it is not provided, the program will read the positional arguments input as a text.', type=argparse.FileType('r'), required=False, default=None)
    # ``const`` covers a bare ``-o`` with no filename: without it argparse
    # stores ``None`` and the later ``write`` call fails.
    parser.add_argument('-o', '--output', help="the {o}utput file where to store the ius. By default the segmentation will be shown on the terminal and will not be stored on disk.", nargs='?', type=argparse.FileType('w'), default=sys.stdout, const=sys.stdout)
    parser.add_argument('pos_input', help="the input text to analyze if no filename is provided.", nargs='*')
    parser.add_argument('-b', '--before', help="a sequence of text to place {b}efore each IU. By default, no prefix is set.", required=False, default="")
    parser.add_argument('-s', '--separator', help="a sequence of text to {s}eparate the index from the IU. By default, the separator is the character | .", required=False, default="|")
    parser.add_argument('-a', '--after', help="a sequence of text to place {a}fter each IU. By default, the suffix is the newline character.", required=False, default="\n")
    return parser


def main(argv=None):
    """Segment the requested text into Idea Units and write the result.

    The text comes from the file given with ``-i`` or, failing that, from the
    positional arguments joined with single spaces. The formatted segmentation
    is written to the ``-o`` file, or to standard output by default.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        ``0`` on success. A missing input is reported through
        ``parser.error``, which prints the usage message and exits.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    input_file = args.input
    text_input = ' '.join(args.pos_input) if args.pos_input else None

    if input_file is None and text_input is None:
        parser.error("Please provide a valid input text. Run iuextract -h for more help.")
    if input_file is not None and text_input is not None:
        input_warning = "WARNING: the program detected both an input file passed via the -i argument and input text via positional arguments. Ignoring the positional arguments and only processing the input file."
        warnings.warn(input_warning)

    if input_file is not None:
        # The file always wins over positional text (see the warning above).
        try:
            text = input_file.read()
        finally:
            # argparse.FileType maps "-" to sys.stdin; leave that one open.
            if input_file is not sys.stdin:
                input_file.close()
    else:
        text = text_input

    nlp = spacy.load("en_core_web_lg")
    parsed = import_file(text, nlp=nlp)
    # Return value is ignored; presumably annotates ``parsed`` in place —
    # verify against iuextract.extract.label_ius.
    label_ius(parsed)
    rendered = iuDoc2str(parsed, gold=False, index_sep=args.separator, opener=args.before, closer=args.after)
    if not rendered.endswith('\n'):
        rendered = f'{rendered}\n'

    sink = args.output
    try:
        sink.write(rendered)
    finally:
        # Close real files so the output is flushed; never close the terminal.
        if sink is not sys.stdout:
            sink.close()
    return 0


# The console script is declared as ``iuextract:__main__`` (pyproject.toml and
# entry_points.txt), which points at this module rather than at a function, so
# the CLI has to keep running at import time. Exiting here also keeps a
# generated wrapper from going on to call the module object.
# NOTE(review): the cleaner fix is to declare the entry point as
# ``iuextract.__main__:main`` and guard this call with
# ``if __name__ == "__main__":`` — both changes must land together.
sys.exit(main())