pelican-nlp 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pelican_nlp/_version.py +1 -1
- pelican_nlp/cli.py +4 -0
- pelican_nlp/extraction/language_model.py +26 -2
- pelican_nlp/main.py +4 -2
- pelican_nlp/sample_configuration_files/config_general.yml +2 -2
- {pelican_nlp-0.2.5.dist-info → pelican_nlp-0.2.7.dist-info}/METADATA +21 -7
- {pelican_nlp-0.2.5.dist-info → pelican_nlp-0.2.7.dist-info}/RECORD +11 -11
- {pelican_nlp-0.2.5.dist-info → pelican_nlp-0.2.7.dist-info}/WHEEL +0 -0
- {pelican_nlp-0.2.5.dist-info → pelican_nlp-0.2.7.dist-info}/entry_points.txt +0 -0
- {pelican_nlp-0.2.5.dist-info → pelican_nlp-0.2.7.dist-info}/licenses/LICENSE +0 -0
- {pelican_nlp-0.2.5.dist-info → pelican_nlp-0.2.7.dist-info}/top_level.txt +0 -0
pelican_nlp/_version.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "0.2.5"
|
1
|
+
__version__ = "0.2.7"
|
pelican_nlp/cli.py
CHANGED
@@ -7,6 +7,10 @@ def main():
|
|
7
7
|
print("No .yml configuration file found in the current directory.")
|
8
8
|
return
|
9
9
|
|
10
|
+
if len(config_files) > 1:
|
11
|
+
print("More than one configuration file found - remove unneeded files from project directory")
|
12
|
+
return
|
13
|
+
|
10
14
|
config_file = config_files[0] # You could also add logic to choose or validate
|
11
15
|
print(f"Using configuration file: {config_file}")
|
12
16
|
|
@@ -1,5 +1,6 @@
|
|
1
1
|
import torch
|
2
2
|
import psutil
|
3
|
+
import os
|
3
4
|
|
4
5
|
from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
|
5
6
|
from transformers import AutoModelForCausalLM
|
@@ -17,8 +18,31 @@ class Model:
|
|
17
18
|
if self.model_name == 'fastText':
|
18
19
|
import fasttext
|
19
20
|
import fasttext.util
|
20
|
-
|
21
|
-
|
21
|
+
|
22
|
+
# Create a model directory if it doesn't exist
|
23
|
+
model_dir = os.path.join(os.path.expanduser('~'), '.fasttext')
|
24
|
+
os.makedirs(model_dir, exist_ok=True)
|
25
|
+
|
26
|
+
# Set the model path using proper OS path joining
|
27
|
+
model_path = os.path.join(model_dir, 'cc.de.300.bin')
|
28
|
+
|
29
|
+
# Download only if model doesn't exist
|
30
|
+
if not os.path.exists(model_path):
|
31
|
+
try:
|
32
|
+
fasttext.util.download_model('de', if_exists='ignore')
|
33
|
+
except OSError:
|
34
|
+
# Direct download fallback for Windows
|
35
|
+
import urllib.request
|
36
|
+
url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz'
|
37
|
+
urllib.request.urlretrieve(url, model_path + '.gz')
|
38
|
+
# Decompress the file
|
39
|
+
import gzip
|
40
|
+
with gzip.open(model_path + '.gz', 'rb') as f_in:
|
41
|
+
with open(model_path, 'wb') as f_out:
|
42
|
+
f_out.write(f_in.read())
|
43
|
+
os.remove(model_path + '.gz')
|
44
|
+
|
45
|
+
self.model_instance = fasttext.load_model(model_path)
|
22
46
|
print('FastText model loaded.')
|
23
47
|
elif self.model_name == 'xlm-roberta-base':
|
24
48
|
from transformers import AutoModel
|
pelican_nlp/main.py
CHANGED
@@ -24,6 +24,8 @@ from pelican_nlp.core import Corpus
|
|
24
24
|
from pelican_nlp.utils.setup_functions import subject_instantiator, load_config, remove_previous_derivative_dir
|
25
25
|
from pelican_nlp.preprocessing import LPDS
|
26
26
|
|
27
|
+
project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_fluency/config_fluency.yml'
|
28
|
+
|
27
29
|
class Pelican:
|
28
30
|
|
29
31
|
"""Main class for the Pelican project handling document processing and metric extraction."""
|
@@ -31,7 +33,7 @@ class Pelican:
|
|
31
33
|
def __init__(self, config_path: str = None, dev_mode: bool = True) -> None:
|
32
34
|
self.dev_mode = dev_mode
|
33
35
|
|
34
|
-
# If no config path is provided, use the default config from package
|
36
|
+
# If no config path is provided, use the default config from package; used for dev-mode
|
35
37
|
if config_path is None:
|
36
38
|
package_dir = Path(__file__).parent
|
37
39
|
default_config = package_dir / 'configuration_files' / 'config_fluency.yml'
|
@@ -205,4 +207,4 @@ class Pelican:
|
|
205
207
|
|
206
208
|
|
207
209
|
if __name__ == '__main__':
|
208
|
-
Pelican().run()
|
210
|
+
Pelican(project_path).run()
|
@@ -5,7 +5,7 @@
|
|
5
5
|
# -------------
|
6
6
|
input_file: "text" # Options: 'text' or 'audio'
|
7
7
|
language: "german"
|
8
|
-
recompute_everything: true
|
8
|
+
recompute_everything: true #If false will give warning if output folder already exists
|
9
9
|
|
10
10
|
# Task Configuration
|
11
11
|
# -----------------
|
@@ -20,7 +20,7 @@ corpus_names: # List of task corpora
|
|
20
20
|
multiple_sessions: false
|
21
21
|
number_of_subjects: null # If null, auto-detected
|
22
22
|
number_of_speakers: 1
|
23
|
-
subject_speakertag: null # Speaker tag for subject (e.g., "B")
|
23
|
+
subject_speakertag: null # Speaker tag for subject (e.g., "B"), only for discourse
|
24
24
|
|
25
25
|
# Document Structure
|
26
26
|
# ----------------
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: pelican_nlp
|
3
|
-
Version: 0.2.5
|
3
|
+
Version: 0.2.7
|
4
4
|
Summary: Preprocessing and Extraction of Linguistic Information for Computational Analysis
|
5
5
|
Author-email: Yves Pauli <yves.pauli@gmail.com>
|
6
6
|
License-Expression: CC-BY-NC-4.0
|
@@ -65,6 +65,18 @@ pelican_nlp stands for "Preprocessing and Extraction of Linguistic Information f
|
|
65
65
|
Installation
|
66
66
|
============
|
67
67
|
|
68
|
+
Create conda environment
|
69
|
+
|
70
|
+
.. code-block:: bash
|
71
|
+
|
72
|
+
conda create -n pelican-nlp -c defaults python=3.10
|
73
|
+
|
74
|
+
Activate environment
|
75
|
+
|
76
|
+
.. code-block:: bash
|
77
|
+
|
78
|
+
conda activate pelican-nlp
|
79
|
+
|
68
80
|
Install the package using pip:
|
69
81
|
|
70
82
|
.. code-block:: bash
|
@@ -80,12 +92,12 @@ For the latest development version:
|
|
80
92
|
Usage
|
81
93
|
=====
|
82
94
|
|
83
|
-
To
|
95
|
+
To run pelican_nlp you need a configuration.yml file in your project directory, which specifies the configurations used for your project.
|
96
|
+
Sample configuration files can be found in the pelican_nlp GitHub repository: https://github.com/ypauli/pelican_nlp/tree/main/sample_configuration_files
|
84
97
|
|
85
|
-
Adapt your configuration file to your needs.
|
86
|
-
ALWAYS change the specified project folder location.
|
98
|
+
Adapt your configuration file to your needs and save your personal configuration.yml file to your main project directory.
|
87
99
|
|
88
|
-
|
100
|
+
Running pelican_nlp with your configurations can be done directly from the command line interface or via Python script.
|
89
101
|
|
90
102
|
Run from command line:
|
91
103
|
|
@@ -93,13 +105,16 @@ Navigate to main project directory in command line and enter the following comma
|
|
93
105
|
|
94
106
|
.. code-block:: bash
|
95
107
|
|
108
|
+
conda activate pelican-nlp
|
96
109
|
pelican-run
|
97
110
|
|
98
111
|
|
99
112
|
Run with python script:
|
100
113
|
|
101
114
|
Create a Python file with the IDE of your choice (e.g. Visual Studio Code, PyCharm, etc.) and copy the following code into the file:
|
115
|
+
Make sure to use the previously created conda environment 'pelican-nlp' for your project.
|
102
116
|
|
117
|
+
Run the following Python code:
|
103
118
|
.. code-block:: python
|
104
119
|
|
105
120
|
from pelican_nlp.main import Pelican
|
@@ -142,8 +157,7 @@ Features
|
|
142
157
|
Examples
|
143
158
|
========
|
144
159
|
|
145
|
-
You can find example setups in the
|
146
|
-
ALWAYS change the path to the project folder specified in the configuration file to your specific project location.
|
160
|
+
You can find example setups in the `examples <https://github.com/ypauli/pelican_nlp/tree/main/examples>`_ folder of the GitHub repository.
|
147
161
|
|
148
162
|
Contributing
|
149
163
|
============
|
@@ -1,7 +1,7 @@
|
|
1
1
|
pelican_nlp/__init__.py,sha256=TD5xjKeXXAH6nUWG-6igbClgovi5r8RIEqI_ix1QeYo,204
|
2
|
-
pelican_nlp/_version.py,sha256=
|
3
|
-
pelican_nlp/cli.py,sha256=
|
4
|
-
pelican_nlp/main.py,sha256=
|
2
|
+
pelican_nlp/_version.py,sha256=LIho7asb0pp1iNbJvXEhRMluyGN4gB4RHIIbAKpROsc,21
|
3
|
+
pelican_nlp/cli.py,sha256=mPz-ASIMUme69G6YGVpTnHr5VfM3XA4h29WFd7DXpa4,588
|
4
|
+
pelican_nlp/main.py,sha256=HX2Rbl4j7RXaMXlGCtggBBqcg3gRh-ey1PdLsQcDX30,7660
|
5
5
|
pelican_nlp/Nils_backup/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
6
|
pelican_nlp/Nils_backup/extract_acoustic_features.py,sha256=eSP8lXxbZ15YE1HqxGtma9uWOcSN-fI-ig-NwQ9eOA8,10771
|
7
7
|
pelican_nlp/Nils_backup/speaker_diarization_Nils.py,sha256=3RIhjKihu4Z1rruMt9KESFE2lqesfzIpRr7rLummUEo,10219
|
@@ -46,7 +46,7 @@ pelican_nlp/extraction/acoustic_feature_extraction.py,sha256=6Csrr6uotarhuAzxYlG
|
|
46
46
|
pelican_nlp/extraction/distance_from_randomness.py,sha256=yikZ3GK2dqpzuNFPVsjuUK0lo6kHOIoIhKPaVrGXRMQ,3365
|
47
47
|
pelican_nlp/extraction/extract_embeddings.py,sha256=e5bcNlskd7f-JkWtfd7YutGV5bqcURKrAkETRyTx93Q,2457
|
48
48
|
pelican_nlp/extraction/extract_logits.py,sha256=Lc7Es86T8mlSvLMhiDHpFdCc0kCZ9fNr3-VFnOyeybs,3869
|
49
|
-
pelican_nlp/extraction/language_model.py,sha256=
|
49
|
+
pelican_nlp/extraction/language_model.py,sha256=npew_4ziTCNE87pjN8LL0eTPujlewVr8pMT7BsmzEjo,4038
|
50
50
|
pelican_nlp/extraction/semantic_similarity.py,sha256=QhY5CAOAorxEo3UBWPlMegFvbySF0KH6j4j3m2I3_NY,2552
|
51
51
|
pelican_nlp/extraction/test_documents/test_features.csv,sha256=LR_3m4vIm-YWKw5gI5ziswhS-NF9VhKv14c2udLxtJU,488482
|
52
52
|
pelican_nlp/extraction/test_documents/wallace_1.15_3.txt,sha256=ShXxOHUZzGPNUqIcOn6-OYkarzNtTC22V05a_Xpvtlw,3731
|
@@ -64,14 +64,14 @@ pelican_nlp/preprocessing/text_normalizer.py,sha256=huo5VFqJ0p2jq-ud1047XvMu1qNe
|
|
64
64
|
pelican_nlp/preprocessing/text_tokenizer.py,sha256=h875bXr0YuMrLh4HtQUvpHmASScddtkQXGaF9mm7uwU,1642
|
65
65
|
pelican_nlp/sample_configuration_files/config_discourse.yml,sha256=OaTCoMwhDjrOIBpw1nKXWIoSWRUUFNjGQdgQZHVrJn0,3570
|
66
66
|
pelican_nlp/sample_configuration_files/config_fluency.yml,sha256=JYpq90K4AF5TslzESJK6Nidw6-D1IiqD_6cdmlCd5-w,2990
|
67
|
-
pelican_nlp/sample_configuration_files/config_general.yml,sha256
|
67
|
+
pelican_nlp/sample_configuration_files/config_general.yml,sha256=-GAVATlqXuQq4ANSW0JauwIGhr7ET_oZiBiM7I40AkA,3424
|
68
68
|
pelican_nlp/utils/__init__.py,sha256=q1tGdOOj5UPRC2mGhoMUh8p4cbFCkkbD21bQaOVvFao,189
|
69
69
|
pelican_nlp/utils/csv_functions.py,sha256=hsG73gm3Up9sAerp6gIxuNHaeP1vJj6HSh7ggVm1SSo,7272
|
70
70
|
pelican_nlp/utils/sample_usage.py,sha256=W__OVMjWND-ZtxxRhfGJDHwbVpGlB-anXDxyA5P4cME,353
|
71
71
|
pelican_nlp/utils/setup_functions.py,sha256=t4WG5qd5iYpNNBGklje_8ukwmJp_C9RMLLi7veDgNeA,3574
|
72
|
-
pelican_nlp-0.2.
|
73
|
-
pelican_nlp-0.2.
|
74
|
-
pelican_nlp-0.2.
|
75
|
-
pelican_nlp-0.2.
|
76
|
-
pelican_nlp-0.2.
|
77
|
-
pelican_nlp-0.2.
|
72
|
+
pelican_nlp-0.2.7.dist-info/licenses/LICENSE,sha256=m3jshBZIXKiBX6qhmhtJcLTVJ1N6BEkQGIflneXvpYg,19336
|
73
|
+
pelican_nlp-0.2.7.dist-info/METADATA,sha256=YyZBYza89dtKbvLLHXkxOEZ1BODloXBjh-zZSODLfVI,6155
|
74
|
+
pelican_nlp-0.2.7.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
75
|
+
pelican_nlp-0.2.7.dist-info/entry_points.txt,sha256=znlG0paAfju9P10UM3rm5HcCHoj4tarTllNpeaqH_gc,53
|
76
|
+
pelican_nlp-0.2.7.dist-info/top_level.txt,sha256=F0qlyqy5FCd3sTS_npUYPeLKN9_BZq6wD4qo9pI0xbg,12
|
77
|
+
pelican_nlp-0.2.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|