pelican-nlp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. pelican_nlp/__init__.py +9 -0
  2. pelican_nlp/core/__init__.py +5 -0
  3. pelican_nlp/core/audio_document.py +20 -0
  4. pelican_nlp/core/corpus.py +296 -0
  5. pelican_nlp/core/document.py +226 -0
  6. pelican_nlp/core/subject.py +30 -0
  7. pelican_nlp/extraction/__init__.py +2 -0
  8. pelican_nlp/extraction/acoustic_feature_extraction.py +71 -0
  9. pelican_nlp/extraction/distance_from_randomness.py +109 -0
  10. pelican_nlp/extraction/extract_embeddings.py +57 -0
  11. pelican_nlp/extraction/extract_logits.py +102 -0
  12. pelican_nlp/extraction/language_model.py +71 -0
  13. pelican_nlp/extraction/semantic_similarity.py +60 -0
  14. pelican_nlp/extraction/test_documents/test_features.csv +4 -0
  15. pelican_nlp/extraction/test_documents/wallace_1.15_3.txt +1 -0
  16. pelican_nlp/extraction/test_documents/wallace_1.1_3.txt +1 -0
  17. pelican_nlp/extraction/test_documents/wallace_1_4.txt +1 -0
  18. pelican_nlp/main.py +211 -0
  19. pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py +34 -0
  20. pelican_nlp/preprocessing/LPDS.py +77 -0
  21. pelican_nlp/preprocessing/__init__.py +7 -0
  22. pelican_nlp/preprocessing/pipeline.py +50 -0
  23. pelican_nlp/preprocessing/speaker_diarization.py +33 -0
  24. pelican_nlp/preprocessing/text_cleaner.py +224 -0
  25. pelican_nlp/preprocessing/text_importer.py +42 -0
  26. pelican_nlp/preprocessing/text_normalizer.py +24 -0
  27. pelican_nlp/preprocessing/text_tokenizer.py +43 -0
  28. pelican_nlp/sample_configuration_files/config_discourse.yml +103 -0
  29. pelican_nlp/sample_configuration_files/config_fluency.yml +108 -0
  30. pelican_nlp/sample_configuration_files/config_general.yml +131 -0
  31. pelican_nlp/utils/__init__.py +3 -0
  32. pelican_nlp/utils/csv_functions.py +193 -0
  33. pelican_nlp/utils/sample_usage.py +17 -0
  34. pelican_nlp/utils/setup_functions.py +93 -0
  35. pelican_nlp-0.1.0.dist-info/METADATA +146 -0
  36. pelican_nlp-0.1.0.dist-info/RECORD +39 -0
  37. pelican_nlp-0.1.0.dist-info/WHEEL +5 -0
  38. pelican_nlp-0.1.0.dist-info/licenses/LICENSE +400 -0
  39. pelican_nlp-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,93 @@
1
+ import os
2
+ from pelican_nlp.core.subject import Subject
3
+ import shutil
4
+ import yaml
5
+ import sys
6
+
7
def subject_instantiator(config):
    """Create a Subject for every folder under <project>/subjects and attach its documents.

    For each subject, documents are gathered from either the session
    directories (when the config enables multiple sessions) or directly from
    the subject folder, then each document gets a ``results_path`` under
    <project>/derivatives mirroring its source location.

    :param config: configuration mapping; must contain
        'PATH_TO_PROJECT_FOLDER' and 'task_name'; 'multiple_sessions' is
        optional and defaults to False.
    :return: list of fully populated Subject instances.
    """
    project_folder = config['PATH_TO_PROJECT_FOLDER']
    path_to_subjects = os.path.join(project_folder, 'subjects')
    print('Instantiating Subjects...')
    subjects = [Subject(subject) for subject in os.listdir(path_to_subjects)]

    # Look the flag up once with a default. The original mixed
    # config['multiple_sessions'] (KeyError when absent) with
    # config.get('multiple_sessions', False); the tolerant form wins.
    multiple_sessions = config.get('multiple_sessions', False)

    # Identifying all subject files
    for subject in subjects:
        if multiple_sessions:
            paths = _get_subject_sessions(subject, project_folder)
        else:
            paths = [os.path.join(path_to_subjects, subject.subjectID)]

        for path in paths:
            file_path = os.path.join(path, config['task_name'])
            subject.documents.extend(_instantiate_documents(file_path, subject.subjectID, config))
        print(f'all identified subject documents for subject {subject.subjectID}: {subject.documents}')

        for document in subject.documents:
            parts = document.file_path.split(os.sep)
            # Adjust path components based on whether a session level exists:
            # .../subject/session/task/file vs .../subject/task/file
            if multiple_sessions:
                subject_ID, session, task = parts[-4], parts[-3], parts[-2]
                document.results_path = os.path.join(project_folder, 'derivatives', subject_ID, session, task)
            else:
                subject_ID, task = parts[-3], parts[-2]
                document.results_path = os.path.join(project_folder, 'derivatives', subject_ID, task)

    return subjects
36
+
37
+ def _get_subject_sessions(subject, project_path):
38
+ session_dir = os.path.join(os.path.join(project_path, 'subjects'), subject.subjectID)
39
+ session_paths = [
40
+ os.path.join(session_dir, session)
41
+ for session in os.listdir(session_dir)
42
+ if os.path.isdir(os.path.join(session_dir, session))
43
+ ]
44
+ subject.numberOfSessions = len(session_paths)
45
+ return session_paths
46
+
47
+ def _instantiate_documents(filepath, subject, config):
48
+
49
+ if config['input_file']=='text':
50
+ from pelican_nlp.core.document import Document
51
+ return [
52
+ Document(
53
+ filepath,
54
+ file_name,
55
+ subject_ID = subject,
56
+ task=config['task_name'],
57
+ fluency=config['fluency_task'],
58
+ has_sections=config['has_multiple_sections'],
59
+ section_identifier=config['section_identification'],
60
+ number_of_sections=config['number_of_sections'],
61
+ num_speakers=config['number_of_speakers'],
62
+ has_section_titles=config['has_section_titles']
63
+ )
64
+ for file_name in os.listdir(filepath)
65
+ ]
66
+
67
+ elif config['input_file']=='audio':
68
+ from pelican_nlp.core.audio_document import AudioFile
69
+ return [
70
+ AudioFile(
71
+ filepath,
72
+ file_name,
73
+ subject_ID=subject,
74
+ task=config['task_name'],
75
+ fluency=config['fluency_task'],
76
+ num_speakers=config['number_of_speakers'],
77
+ )
78
+ for file_name in os.listdir(filepath)
79
+ ]
80
+
81
def remove_previous_derivative_dir(output_directory):
    """Delete *output_directory* and all of its contents if it exists.

    A missing directory is a silent no-op, so the function is safe to call
    on a fresh project.
    """
    directory_present = os.path.isdir(output_directory)
    if directory_present:
        shutil.rmtree(output_directory)
84
+
85
def ignore_files(directory, files):
    """Ignore-callback for shutil.copytree: skip plain files, keep directories.

    :param directory: directory currently being visited.
    :param files: names contained in *directory*.
    :return: the subset of *files* that are regular files (these get ignored,
        so only the directory skeleton is copied).
    """
    ignored = []
    for name in files:
        if os.path.isfile(os.path.join(directory, name)):
            ignored.append(name)
    return ignored
87
+
88
def load_config(config_path):
    """Parse the YAML configuration file at *config_path* and return it.

    Exits the process with a readable message when the configuration cannot
    be loaded.  The original caught only ``yaml.YAMLError`` (syntax errors);
    a missing or unreadable file raised an uncaught OSError, so that case is
    now folded into the same exit path.

    :param config_path: path to a YAML configuration file.
    :return: the parsed configuration (typically a dict).
    """
    try:
        with open(config_path, 'r') as stream:
            return yaml.safe_load(stream)
    except (yaml.YAMLError, OSError) as exc:
        sys.exit(f"Error loading configuration: {exc}")
@@ -0,0 +1,146 @@
1
+ Metadata-Version: 2.4
2
+ Name: pelican_nlp
3
+ Version: 0.1.0
4
+ Summary: Preprocessing and Extraction of Linguistic Information for Computational Analysis
5
+ Author-email: Yves Pauli <yves.pauli@gmail.com>
6
+ License-Expression: CC-BY-NC-4.0
7
+ Project-URL: Homepage, https://github.com/ypauli/PELICAN_nlp
8
+ Project-URL: Repository, https://github.com/ypauli/PELICAN_nlp
9
+ Project-URL: Documentation, https://github.com/ypauli/PELICAN_nlp#readme
10
+ Project-URL: Bug Tracker, https://github.com/ypauli/PELICAN_nlp/issues
11
+ Keywords: nlp,linguistics,preprocessing,language-processing,text-analysis
12
+ Classifier: Development Status :: 1 - Planning
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: Operating System :: POSIX :: Linux
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Natural Language :: English
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Text Processing :: Linguistic
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/x-rst
22
+ License-File: LICENSE
23
+ Requires-Dist: numpy==2.0.1
24
+ Requires-Dist: pandas==2.2.3
25
+ Requires-Dist: PyYAML>=6.0.2
26
+ Requires-Dist: torch==2.5.1
27
+ Requires-Dist: spacy==3.8.2
28
+ Requires-Dist: transformers==4.49.0
29
+ Requires-Dist: docx2txt>=0.9
30
+ Requires-Dist: striprtf>=0.0.28
31
+ Requires-Dist: chardet>=4.0.0
32
+ Requires-Dist: scikit_learn>=1.6.1
33
+ Requires-Dist: scipy==1.15.2
34
+ Requires-Dist: fasttext-wheel==0.9.2
35
+ Requires-Dist: matplotlib>=3.10.0
36
+ Requires-Dist: seaborn>=0.13.2
37
+ Requires-Dist: accelerate==1.4.0
38
+ Requires-Dist: editdistance>=0.8.1
39
+ Requires-Dist: psutil>=6.1.0
40
+ Requires-Dist: tqdm==4.67.1
41
+ Requires-Dist: pytest>=8.3.4
42
+ Requires-Dist: statsmodels>=0.14.4
43
+ Requires-Dist: datasets==3.3.2
44
+ Requires-Dist: huggingface_hub==0.29.2
45
+ Dynamic: license-file
46
+
47
+ ====================================
48
+ PELICAN_nlp
49
+ ====================================
50
+
51
+ PELICAN_nlp stands for "Preprocessing and Extraction of Linguistic Information for Computational Analysis - Natural Language Processing". This package enables the creation of standardized and reproducible language processing pipelines, extracting linguistic features from various tasks like discourse, fluency, and image descriptions.
52
+
53
+ .. image:: https://img.shields.io/pypi/v/pelican-nlp.svg
+    :target: https://pypi.org/project/pelican-nlp/
+    :alt: PyPI version
+
+ .. image:: https://img.shields.io/github/license/ypauli/PELICAN_nlp.svg
+    :target: https://github.com/ypauli/PELICAN_nlp/blob/main/LICENSE
+    :alt: License
+
+ .. image:: https://img.shields.io/pypi/pyversions/pelican-nlp.svg
+    :target: https://pypi.org/project/pelican-nlp/
+    :alt: Supported Python Versions
64
+
65
+ Installation
66
+ ============
67
+
68
+ Install the package using pip:
69
+
70
+ .. code-block:: bash
71
+
72
+ pip install pelican_nlp
73
+
74
+ For the latest development version:
75
+
76
+ .. code-block:: bash
77
+
78
+ pip install git+https://github.com/ypauli/PELICAN_nlp.git
79
+
80
+ Usage
81
+ =====
82
+
83
+ To use the pelican_nlp package:
84
+
85
+ .. code-block:: python
86
+
87
+ from pelican_nlp.main import Pelican
88
+
89
+ configuration_file = "/path/to/your/config/file"
90
+ pelican = Pelican(configuration_file)
91
+ pelican.run()
92
+
93
+ For reliable operation, data must be stored in the *Language Processing Data Structure (LPDS)* format, inspired by brain imaging data structure conventions.
94
+
95
+ Text and audio files should follow this naming convention:
96
+
97
+ subjectID_sessionID_task_task-supplement_corpus.extension
98
+
99
+ - subjectID: ID of subject (e.g., sub-01), mandatory
100
+ - sessionID: ID of session (e.g., ses-01), if available
101
+ - task: task used for file creation, mandatory
102
+ - task-supplement: additional information regarding the task, if available
103
+ - corpus: (e.g., healthy-control / patient) specify files belonging to the same group, mandatory
104
+ - extension: file extension (e.g., txt / pdf / docx / rtf), mandatory
105
+
106
+ Example filenames:
107
+ - sub-01_ses-01_interview_schizophrenia.rtf
108
+ - sub-03_ses-02_fluency_semantic_animals.docx
109
+
110
+ To optimize performance, close other programs and limit GPU usage during language processing.
111
+
112
+ Features
113
+ ========
114
+
115
+ - **Feature 1: Cleaning text files**
116
+ - Handles whitespaces, timestamps, punctuation, special characters, and case-sensitivity.
117
+
118
+ - **Feature 2: Linguistic Feature Extraction**
119
+ - Extracts semantic embeddings, logits, distance from optimality, and semantic similarity.
120
+
121
+ Examples
122
+ ========
123
+
124
+ Here's a detailed usage example:
125
+
126
+ .. code-block:: python
127
+
128
+ from pelican_nlp.main import Pelican
+
+ configuration_file = "config_fluency.yml"
+ pelican = Pelican(configuration_file)
+ pelican.run()
132
+
133
+ *Link to config_fluency.yml*
134
+
135
+ Sample folder for data collection of the semantic fluency task:
136
+ *Link to sample_folder*
137
+
138
+ Contributing
139
+ ============
140
+
141
+ Contributions are welcome! Please check out the `contributing guide <https://github.com/ypauli/PELICAN/blob/main/CONTRIBUTING.md>`_.
142
+
143
+ License
144
+ =======
145
+
146
+ This project is licensed under Attribution-NonCommercial 4.0 International. See the `LICENSE <https://github.com/ypauli/PELICAN/blob/main/LICENSE>`_ file for details.
@@ -0,0 +1,39 @@
1
+ pelican_nlp/__init__.py,sha256=yLyG5Amt7nuHQMHz0tuGUVsGBtgVmXT0LMeRo3an-OU,192
2
+ pelican_nlp/main.py,sha256=xKUqqA3sh9kbk07lKA_poILIU1c8oIeaSsVqPOPY5Tk,7596
3
+ pelican_nlp/core/__init__.py,sha256=whJc5dWsGsKn2IAw-D4BvCvUKW1sVtWYE1WJIuUr5uI,165
4
+ pelican_nlp/core/audio_document.py,sha256=hhSJNgeqSYa6_uws2ho66agHhAdHuKN3EIEdIsIcXKg,586
5
+ pelican_nlp/core/corpus.py,sha256=6pDRmeO0XoHylhjLE4Fi5Tc3HCMQJ-Xk0YRzEfz5Z1Y,15168
6
+ pelican_nlp/core/document.py,sha256=j2HP5FX6cfmXHo7OWVFCX6cMsDyqsOmNlnGNNNfCm2c,8467
7
+ pelican_nlp/core/subject.py,sha256=-pi3jDzb2zLiG8JNAi9i-9Jd-VtsPxDO4ShQci2QSMg,1059
8
+ pelican_nlp/extraction/__init__.py,sha256=hfqFiaKpQBS6cwRm9Yd7MpOcV60_xJmwuQ2Kegary5k,84
9
+ pelican_nlp/extraction/acoustic_feature_extraction.py,sha256=6Csrr6uotarhuAzxYlGFAil9K4PLUqa9vWw607peRoA,2319
10
+ pelican_nlp/extraction/distance_from_randomness.py,sha256=yikZ3GK2dqpzuNFPVsjuUK0lo6kHOIoIhKPaVrGXRMQ,3365
11
+ pelican_nlp/extraction/extract_embeddings.py,sha256=e5bcNlskd7f-JkWtfd7YutGV5bqcURKrAkETRyTx93Q,2457
12
+ pelican_nlp/extraction/extract_logits.py,sha256=Lc7Es86T8mlSvLMhiDHpFdCc0kCZ9fNr3-VFnOyeybs,3869
13
+ pelican_nlp/extraction/language_model.py,sha256=4tHJZIRCEeHVTwEf2jmOtu-zDGkdXiDjKmlpuxDuLiw,2929
14
+ pelican_nlp/extraction/semantic_similarity.py,sha256=QhY5CAOAorxEo3UBWPlMegFvbySF0KH6j4j3m2I3_NY,2552
15
+ pelican_nlp/extraction/test_documents/test_features.csv,sha256=LR_3m4vIm-YWKw5gI5ziswhS-NF9VhKv14c2udLxtJU,488482
16
+ pelican_nlp/extraction/test_documents/wallace_1.15_3.txt,sha256=ShXxOHUZzGPNUqIcOn6-OYkarzNtTC22V05a_Xpvtlw,3731
17
+ pelican_nlp/extraction/test_documents/wallace_1.1_3.txt,sha256=gs5REE10myK3Nm9JBOV8hjqKcMRkrl7BasuK7HSBe5M,3695
18
+ pelican_nlp/extraction/test_documents/wallace_1_4.txt,sha256=95Z7gS92KERCocrbOAFbJntf5QoE-6p0GL67XQEffqI,3963
19
+ pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py,sha256=svXXyLEA62mLa0KUfSiOSFFMjYk17K7BJbxUoLf0l9w,1468
20
+ pelican_nlp/preprocessing/LPDS.py,sha256=4UWkMMSrdU-nWVi8eKiWQSGD7f7lemB42aI0fFn6ZLU,4097
21
+ pelican_nlp/preprocessing/__init__.py,sha256=ZYgOUlKPXmltYez3urPZmsAWRWSEqZ3_l_gN2aqd15s,293
22
+ pelican_nlp/preprocessing/pipeline.py,sha256=t2zJAvZRO12MdAKQgm8XZxfZND7_8gFtzHF9Rq2L2aE,1796
23
+ pelican_nlp/preprocessing/speaker_diarization.py,sha256=N6dZCa2AHHGw__g9e-ZUyZM_In0-nzFOkZ44cBnoKLk,1122
24
+ pelican_nlp/preprocessing/text_cleaner.py,sha256=QKqxwoRR8dnuBYiY-PXK1kB7744TVUcUMJb7dbKvXGk,7512
25
+ pelican_nlp/preprocessing/text_importer.py,sha256=FtSyJjFXDxVle7Jpyw6EqCLDbLTCRxqVQi9ymWWtPB4,1356
26
+ pelican_nlp/preprocessing/text_normalizer.py,sha256=huo5VFqJ0p2jq-ud1047XvMu1qNeaiuG879SF3zkJoM,894
27
+ pelican_nlp/preprocessing/text_tokenizer.py,sha256=h875bXr0YuMrLh4HtQUvpHmASScddtkQXGaF9mm7uwU,1642
28
+ pelican_nlp/sample_configuration_files/config_discourse.yml,sha256=xVHIUpSORV6iR0nEvuess6rfiAvuGEkqmaMWD_6kyFE,3618
29
+ pelican_nlp/sample_configuration_files/config_fluency.yml,sha256=oQ6Y2BhRLExEMpS3VRH2pFrGHi788L66aSYUm05nV_A,3038
30
+ pelican_nlp/sample_configuration_files/config_general.yml,sha256=UuGnZUa-SVmioE9NmXWOMKuv3uG5mNjIuXgA6-Y0JS0,3440
31
+ pelican_nlp/utils/__init__.py,sha256=q1tGdOOj5UPRC2mGhoMUh8p4cbFCkkbD21bQaOVvFao,189
32
+ pelican_nlp/utils/csv_functions.py,sha256=hsG73gm3Up9sAerp6gIxuNHaeP1vJj6HSh7ggVm1SSo,7272
33
+ pelican_nlp/utils/sample_usage.py,sha256=W__OVMjWND-ZtxxRhfGJDHwbVpGlB-anXDxyA5P4cME,353
34
+ pelican_nlp/utils/setup_functions.py,sha256=s0QcarswU8qeFBcEQNIYC1ooaD-xwRiTJn--yPEId8E,3612
35
+ pelican_nlp-0.1.0.dist-info/licenses/LICENSE,sha256=m3jshBZIXKiBX6qhmhtJcLTVJ1N6BEkQGIflneXvpYg,19336
36
+ pelican_nlp-0.1.0.dist-info/METADATA,sha256=kIWgpFUOeQC1c-DYvSPoN82OXBgV7TJtPLUGLNC5KDs,4947
37
+ pelican_nlp-0.1.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
38
+ pelican_nlp-0.1.0.dist-info/top_level.txt,sha256=F0qlyqy5FCd3sTS_npUYPeLKN9_BZq6wD4qo9pI0xbg,12
39
+ pelican_nlp-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (78.1.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+