pelican_nlp-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pelican_nlp/__init__.py +9 -0
- pelican_nlp/core/__init__.py +5 -0
- pelican_nlp/core/audio_document.py +20 -0
- pelican_nlp/core/corpus.py +296 -0
- pelican_nlp/core/document.py +226 -0
- pelican_nlp/core/subject.py +30 -0
- pelican_nlp/extraction/__init__.py +2 -0
- pelican_nlp/extraction/acoustic_feature_extraction.py +71 -0
- pelican_nlp/extraction/distance_from_randomness.py +109 -0
- pelican_nlp/extraction/extract_embeddings.py +57 -0
- pelican_nlp/extraction/extract_logits.py +102 -0
- pelican_nlp/extraction/language_model.py +71 -0
- pelican_nlp/extraction/semantic_similarity.py +60 -0
- pelican_nlp/extraction/test_documents/test_features.csv +4 -0
- pelican_nlp/extraction/test_documents/wallace_1.15_3.txt +1 -0
- pelican_nlp/extraction/test_documents/wallace_1.1_3.txt +1 -0
- pelican_nlp/extraction/test_documents/wallace_1_4.txt +1 -0
- pelican_nlp/main.py +211 -0
- pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py +34 -0
- pelican_nlp/preprocessing/LPDS.py +77 -0
- pelican_nlp/preprocessing/__init__.py +7 -0
- pelican_nlp/preprocessing/pipeline.py +50 -0
- pelican_nlp/preprocessing/speaker_diarization.py +33 -0
- pelican_nlp/preprocessing/text_cleaner.py +224 -0
- pelican_nlp/preprocessing/text_importer.py +42 -0
- pelican_nlp/preprocessing/text_normalizer.py +24 -0
- pelican_nlp/preprocessing/text_tokenizer.py +43 -0
- pelican_nlp/sample_configuration_files/config_discourse.yml +103 -0
- pelican_nlp/sample_configuration_files/config_fluency.yml +108 -0
- pelican_nlp/sample_configuration_files/config_general.yml +131 -0
- pelican_nlp/utils/__init__.py +3 -0
- pelican_nlp/utils/csv_functions.py +193 -0
- pelican_nlp/utils/sample_usage.py +17 -0
- pelican_nlp/utils/setup_functions.py +93 -0
- pelican_nlp-0.1.0.dist-info/METADATA +146 -0
- pelican_nlp-0.1.0.dist-info/RECORD +39 -0
- pelican_nlp-0.1.0.dist-info/WHEEL +5 -0
- pelican_nlp-0.1.0.dist-info/licenses/LICENSE +400 -0
- pelican_nlp-0.1.0.dist-info/top_level.txt +1 -0
pelican_nlp/utils/setup_functions.py
@@ -0,0 +1,93 @@
+import os
+import shutil
+import sys
+import yaml
+from pelican_nlp.core.subject import Subject
+
+def subject_instantiator(config):
+    project_folder = config['PATH_TO_PROJECT_FOLDER']
+    path_to_subjects = os.path.join(project_folder, 'subjects')
+    print('Instantiating Subjects...')
+    subjects = [Subject(subject) for subject in os.listdir(path_to_subjects)]
+
+    # Identify all files belonging to each subject
+    for subject in subjects:
+        if config['multiple_sessions']:
+            paths = _get_subject_sessions(subject, project_folder)
+        else:
+            paths = [os.path.join(path_to_subjects, subject.subjectID)]
+
+        for path in paths:
+            file_path = os.path.join(path, config['task_name'])
+            subject.documents.extend(_instantiate_documents(file_path, subject.subjectID, config))
+        print(f'all identified subject documents for subject {subject.subjectID}: {subject.documents}')
+        for document in subject.documents:
+            parts = document.file_path.split(os.sep)
+
+            # Adjust path components based on whether a session level exists
+            if config.get('multiple_sessions', False):
+                subject_ID, session, task = parts[-4], parts[-3], parts[-2]
+                document.results_path = os.path.join(project_folder, 'derivatives', subject_ID, session, task)
+            else:
+                subject_ID, task = parts[-3], parts[-2]
+                document.results_path = os.path.join(project_folder, 'derivatives', subject_ID, task)
+
+    return subjects
+
+def _get_subject_sessions(subject, project_path):
+    session_dir = os.path.join(project_path, 'subjects', subject.subjectID)
+    session_paths = [
+        os.path.join(session_dir, session)
+        for session in os.listdir(session_dir)
+        if os.path.isdir(os.path.join(session_dir, session))
+    ]
+    subject.numberOfSessions = len(session_paths)
+    return session_paths
+
+def _instantiate_documents(filepath, subject, config):
+
+    if config['input_file'] == 'text':
+        from pelican_nlp.core.document import Document
+        return [
+            Document(
+                filepath,
+                file_name,
+                subject_ID=subject,
+                task=config['task_name'],
+                fluency=config['fluency_task'],
+                has_sections=config['has_multiple_sections'],
+                section_identifier=config['section_identification'],
+                number_of_sections=config['number_of_sections'],
+                num_speakers=config['number_of_speakers'],
+                has_section_titles=config['has_section_titles']
+            )
+            for file_name in os.listdir(filepath)
+        ]
+
+    elif config['input_file'] == 'audio':
+        from pelican_nlp.core.audio_document import AudioFile
+        return [
+            AudioFile(
+                filepath,
+                file_name,
+                subject_ID=subject,
+                task=config['task_name'],
+                fluency=config['fluency_task'],
+                num_speakers=config['number_of_speakers'],
+            )
+            for file_name in os.listdir(filepath)
+        ]
+
+def remove_previous_derivative_dir(output_directory):
+    if os.path.isdir(output_directory):
+        shutil.rmtree(output_directory)
+
+def ignore_files(directory, files):
+    return [f for f in files if os.path.isfile(os.path.join(directory, f))]
+
+def load_config(config_path):
+    try:
+        with open(config_path, 'r') as stream:
+            return yaml.safe_load(stream)
+    except yaml.YAMLError as exc:
+        sys.exit(f"Error loading configuration: {exc}")
pelican_nlp-0.1.0.dist-info/METADATA
@@ -0,0 +1,146 @@
+Metadata-Version: 2.4
+Name: pelican_nlp
+Version: 0.1.0
+Summary: Preprocessing and Extraction of Linguistic Information for Computational Analysis
+Author-email: Yves Pauli <yves.pauli@gmail.com>
+License-Expression: CC-BY-NC-4.0
+Project-URL: Homepage, https://github.com/ypauli/PELICAN_nlp
+Project-URL: Repository, https://github.com/ypauli/PELICAN_nlp
+Project-URL: Documentation, https://github.com/ypauli/PELICAN_nlp#readme
+Project-URL: Bug Tracker, https://github.com/ypauli/PELICAN_nlp/issues
+Keywords: nlp,linguistics,preprocessing,language-processing,text-analysis
+Classifier: Development Status :: 1 - Planning
+Classifier: Intended Audience :: Science/Research
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Natural Language :: English
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Text Processing :: Linguistic
+Requires-Python: >=3.10
+Description-Content-Type: text/x-rst
+License-File: LICENSE
+Requires-Dist: numpy==2.0.1
+Requires-Dist: pandas==2.2.3
+Requires-Dist: PyYAML>=6.0.2
+Requires-Dist: torch==2.5.1
+Requires-Dist: spacy==3.8.2
+Requires-Dist: transformers==4.49.0
+Requires-Dist: docx2txt>=0.9
+Requires-Dist: striprtf>=0.0.28
+Requires-Dist: chardet>=4.0.0
+Requires-Dist: scikit_learn>=1.6.1
+Requires-Dist: scipy==1.15.2
+Requires-Dist: fasttext-wheel==0.9.2
+Requires-Dist: matplotlib>=3.10.0
+Requires-Dist: seaborn>=0.13.2
+Requires-Dist: accelerate==1.4.0
+Requires-Dist: editdistance>=0.8.1
+Requires-Dist: psutil>=6.1.0
+Requires-Dist: tqdm==4.67.1
+Requires-Dist: pytest>=8.3.4
+Requires-Dist: statsmodels>=0.14.4
+Requires-Dist: datasets==3.3.2
+Requires-Dist: huggingface_hub==0.29.2
+Dynamic: license-file
+
+====================================
+PELICAN_nlp
+====================================
+
+PELICAN_nlp stands for "Preprocessing and Extraction of Linguistic Information for Computational Analysis - Natural Language Processing". The package enables standardized and reproducible language-processing pipelines that extract linguistic features from tasks such as discourse, fluency, and image description.
+
+.. image:: https://img.shields.io/pypi/v/pelican_nlp.svg
+   :target: https://pypi.org/project/pelican_nlp/
+   :alt: PyPI version
+
+.. image:: https://img.shields.io/github/license/ypauli/PELICAN-nlp.svg
+   :target: https://github.com/ypauli/PELICAN-nlp/blob/main/LICENSE
+   :alt: License
+
+.. image:: https://img.shields.io/pypi/pyversions/pelican_nlp.svg
+   :target: https://pypi.org/project/pelican_nlp/
+   :alt: Supported Python Versions
+
+Installation
+============
+
+Install the package using pip:
+
+.. code-block:: bash
+
+    pip install pelican_nlp
+
+For the latest development version:
+
+.. code-block:: bash
+
+    pip install git+https://github.com/ypauli/PELICAN_nlp.git
+
+Usage
+=====
+
+To use the pelican_nlp package:
+
+.. code-block:: python
+
+    from pelican_nlp.main import Pelican
+
+    configuration_file = "/path/to/your/config/file"
+    pelican = Pelican(configuration_file)
+    pelican.run()
+
+For reliable operation, data must be stored in the *Language Processing Data Structure (LPDS)* format, inspired by the Brain Imaging Data Structure (BIDS) conventions.
+
+Text and audio files should follow this naming convention:
+
+subjectID_sessionID_task_task-supplement_corpus.extension
+
+- subjectID: ID of the subject (e.g., sub-01); mandatory
+- sessionID: ID of the session (e.g., ses-01); if available
+- task: task used for file creation; mandatory
+- task-supplement: additional information about the task; if available
+- corpus: group the files belong to (e.g., healthy-control / patient); mandatory
+- extension: file extension (e.g., txt / pdf / docx / rtf); mandatory
+
+Example filenames:
+
+- sub-01_ses-01_interview_schizophrenia.rtf
+- sub-03_ses-02_fluency_semantic_animals.docx
+
+To optimize performance, close other programs and limit GPU usage during language processing.
+
+Features
+========
+
+- **Feature 1: Cleaning text files**
+    - Handles whitespace, timestamps, punctuation, special characters, and case-sensitivity.
+
+- **Feature 2: Linguistic Feature Extraction**
+    - Extracts semantic embeddings, logits, distance from optimality, and semantic similarity.
+
+Examples
+========
+
+Here's a usage example with the sample fluency configuration:
+
+.. code-block:: python
+
+    from pelican_nlp.main import Pelican
+
+    configuration_file = "config_fluency.yml"
+    pelican = Pelican(configuration_file)
+    pelican.run()
+
+*Link to config_fluency.yml*
+
+Sample folder for data collection of the semantic fluency task:
+*Link to sample_folder*
+
+Contributing
+============
+
+Contributions are welcome! Please check out the `contributing guide <https://github.com/ypauli/PELICAN/blob/main/CONTRIBUTING.md>`_.
+
+License
+=======
+
+This project is licensed under the Creative Commons Attribution-NonCommercial 4.0 International license (CC BY-NC 4.0). See the `LICENSE <https://github.com/ypauli/PELICAN/blob/main/LICENSE>`_ file for details.
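
The LPDS file-naming convention described in the README above is positional and underscore-delimited, with sessionID and task-supplement optional. A sketch of how such names could be parsed; the helper and its heuristics are illustrative only, not part of pelican_nlp:

.. code-block:: python

    import os

    def parse_lpds_name(filename):
        """Split subjectID[_sessionID]_task[_task-supplement]_corpus.extension."""
        stem, ext = os.path.splitext(filename)
        parts = stem.split('_')
        fields = {'subjectID': parts[0], 'extension': ext.lstrip('.')}
        if len(parts) > 1 and parts[1].startswith('ses-'):  # optional session part
            fields['sessionID'] = parts[1]
            parts = [parts[0]] + parts[2:]
        fields['task'] = parts[1]
        fields['corpus'] = parts[-1]
        if len(parts) > 3:  # anything between task and corpus is a supplement
            fields['task-supplement'] = '_'.join(parts[2:-1])
        return fields

    print(parse_lpds_name("sub-03_ses-02_fluency_semantic_animals.docx"))
    # {'subjectID': 'sub-03', 'extension': 'docx', 'sessionID': 'ses-02',
    #  'task': 'fluency', 'task-supplement': 'semantic', 'corpus': 'animals'}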
pelican_nlp-0.1.0.dist-info/RECORD
@@ -0,0 +1,39 @@
+pelican_nlp/__init__.py,sha256=yLyG5Amt7nuHQMHz0tuGUVsGBtgVmXT0LMeRo3an-OU,192
+pelican_nlp/main.py,sha256=xKUqqA3sh9kbk07lKA_poILIU1c8oIeaSsVqPOPY5Tk,7596
+pelican_nlp/core/__init__.py,sha256=whJc5dWsGsKn2IAw-D4BvCvUKW1sVtWYE1WJIuUr5uI,165
+pelican_nlp/core/audio_document.py,sha256=hhSJNgeqSYa6_uws2ho66agHhAdHuKN3EIEdIsIcXKg,586
+pelican_nlp/core/corpus.py,sha256=6pDRmeO0XoHylhjLE4Fi5Tc3HCMQJ-Xk0YRzEfz5Z1Y,15168
+pelican_nlp/core/document.py,sha256=j2HP5FX6cfmXHo7OWVFCX6cMsDyqsOmNlnGNNNfCm2c,8467
+pelican_nlp/core/subject.py,sha256=-pi3jDzb2zLiG8JNAi9i-9Jd-VtsPxDO4ShQci2QSMg,1059
+pelican_nlp/extraction/__init__.py,sha256=hfqFiaKpQBS6cwRm9Yd7MpOcV60_xJmwuQ2Kegary5k,84
+pelican_nlp/extraction/acoustic_feature_extraction.py,sha256=6Csrr6uotarhuAzxYlGFAil9K4PLUqa9vWw607peRoA,2319
+pelican_nlp/extraction/distance_from_randomness.py,sha256=yikZ3GK2dqpzuNFPVsjuUK0lo6kHOIoIhKPaVrGXRMQ,3365
+pelican_nlp/extraction/extract_embeddings.py,sha256=e5bcNlskd7f-JkWtfd7YutGV5bqcURKrAkETRyTx93Q,2457
+pelican_nlp/extraction/extract_logits.py,sha256=Lc7Es86T8mlSvLMhiDHpFdCc0kCZ9fNr3-VFnOyeybs,3869
+pelican_nlp/extraction/language_model.py,sha256=4tHJZIRCEeHVTwEf2jmOtu-zDGkdXiDjKmlpuxDuLiw,2929
+pelican_nlp/extraction/semantic_similarity.py,sha256=QhY5CAOAorxEo3UBWPlMegFvbySF0KH6j4j3m2I3_NY,2552
+pelican_nlp/extraction/test_documents/test_features.csv,sha256=LR_3m4vIm-YWKw5gI5ziswhS-NF9VhKv14c2udLxtJU,488482
+pelican_nlp/extraction/test_documents/wallace_1.15_3.txt,sha256=ShXxOHUZzGPNUqIcOn6-OYkarzNtTC22V05a_Xpvtlw,3731
+pelican_nlp/extraction/test_documents/wallace_1.1_3.txt,sha256=gs5REE10myK3Nm9JBOV8hjqKcMRkrl7BasuK7HSBe5M,3695
+pelican_nlp/extraction/test_documents/wallace_1_4.txt,sha256=95Z7gS92KERCocrbOAFbJntf5QoE-6p0GL67XQEffqI,3963
+pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py,sha256=svXXyLEA62mLa0KUfSiOSFFMjYk17K7BJbxUoLf0l9w,1468
+pelican_nlp/preprocessing/LPDS.py,sha256=4UWkMMSrdU-nWVi8eKiWQSGD7f7lemB42aI0fFn6ZLU,4097
+pelican_nlp/preprocessing/__init__.py,sha256=ZYgOUlKPXmltYez3urPZmsAWRWSEqZ3_l_gN2aqd15s,293
+pelican_nlp/preprocessing/pipeline.py,sha256=t2zJAvZRO12MdAKQgm8XZxfZND7_8gFtzHF9Rq2L2aE,1796
+pelican_nlp/preprocessing/speaker_diarization.py,sha256=N6dZCa2AHHGw__g9e-ZUyZM_In0-nzFOkZ44cBnoKLk,1122
+pelican_nlp/preprocessing/text_cleaner.py,sha256=QKqxwoRR8dnuBYiY-PXK1kB7744TVUcUMJb7dbKvXGk,7512
+pelican_nlp/preprocessing/text_importer.py,sha256=FtSyJjFXDxVle7Jpyw6EqCLDbLTCRxqVQi9ymWWtPB4,1356
+pelican_nlp/preprocessing/text_normalizer.py,sha256=huo5VFqJ0p2jq-ud1047XvMu1qNeaiuG879SF3zkJoM,894
+pelican_nlp/preprocessing/text_tokenizer.py,sha256=h875bXr0YuMrLh4HtQUvpHmASScddtkQXGaF9mm7uwU,1642
+pelican_nlp/sample_configuration_files/config_discourse.yml,sha256=xVHIUpSORV6iR0nEvuess6rfiAvuGEkqmaMWD_6kyFE,3618
+pelican_nlp/sample_configuration_files/config_fluency.yml,sha256=oQ6Y2BhRLExEMpS3VRH2pFrGHi788L66aSYUm05nV_A,3038
+pelican_nlp/sample_configuration_files/config_general.yml,sha256=UuGnZUa-SVmioE9NmXWOMKuv3uG5mNjIuXgA6-Y0JS0,3440
+pelican_nlp/utils/__init__.py,sha256=q1tGdOOj5UPRC2mGhoMUh8p4cbFCkkbD21bQaOVvFao,189
+pelican_nlp/utils/csv_functions.py,sha256=hsG73gm3Up9sAerp6gIxuNHaeP1vJj6HSh7ggVm1SSo,7272
+pelican_nlp/utils/sample_usage.py,sha256=W__OVMjWND-ZtxxRhfGJDHwbVpGlB-anXDxyA5P4cME,353
+pelican_nlp/utils/setup_functions.py,sha256=s0QcarswU8qeFBcEQNIYC1ooaD-xwRiTJn--yPEId8E,3612
+pelican_nlp-0.1.0.dist-info/licenses/LICENSE,sha256=m3jshBZIXKiBX6qhmhtJcLTVJ1N6BEkQGIflneXvpYg,19336
+pelican_nlp-0.1.0.dist-info/METADATA,sha256=kIWgpFUOeQC1c-DYvSPoN82OXBgV7TJtPLUGLNC5KDs,4947
+pelican_nlp-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+pelican_nlp-0.1.0.dist-info/top_level.txt,sha256=F0qlyqy5FCd3sTS_npUYPeLKN9_BZq6wD4qo9pI0xbg,12
+pelican_nlp-0.1.0.dist-info/RECORD,,
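
Each RECORD entry above has the form path,sha256=<digest>,size, where <digest> is the urlsafe base64-encoded SHA-256 of the file with trailing '=' padding stripped (standard wheel RECORD format). A small sketch for checking one entry against an unpacked wheel; the helper is illustrative, and the path and digest come from the RECORD above:

.. code-block:: python

    import base64
    import hashlib

    def record_digest(path):
        """RECORD-style digest: urlsafe base64 of SHA-256, '=' padding stripped."""
        with open(path, 'rb') as f:
            raw = hashlib.sha256(f.read()).digest()
        return base64.urlsafe_b64encode(raw).rstrip(b'=').decode('ascii')

    # From the RECORD: pelican_nlp/utils/setup_functions.py,sha256=...,3612
    assert record_digest("pelican_nlp/utils/setup_functions.py") == \
        "s0QcarswU8qeFBcEQNIYC1ooaD-xwRiTJn--yPEId8E"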