pelican-nlp 0.2.6__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. {pelican_nlp-0.2.6/pelican_nlp.egg-info → pelican_nlp-0.3.0}/PKG-INFO +3 -3
  2. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/README.rst +2 -2
  3. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_discourse/config_discourse.yml +0 -1
  4. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/config_image-descriptions.yml +0 -1
  5. pelican_nlp-0.3.0/pelican_nlp/_version.py +1 -0
  6. pelican_nlp-0.3.0/pelican_nlp/config.py +14 -0
  7. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/core/corpus.py +26 -30
  8. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/core/subject.py +3 -3
  9. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/extraction/acoustic_feature_extraction.py +1 -1
  10. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/extraction/extract_embeddings.py +3 -1
  11. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/extraction/extract_logits.py +4 -2
  12. pelican_nlp-0.3.0/pelican_nlp/extraction/language_model.py +125 -0
  13. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/main.py +49 -27
  14. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/LPDS.py +9 -6
  15. {pelican_nlp-0.2.6/examples/PyPI_testing_fluency → pelican_nlp-0.3.0/pelican_nlp/sample_configuration_files}/config_fluency.yml +0 -1
  16. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/utils/csv_functions.py +45 -35
  17. pelican_nlp-0.3.0/pelican_nlp/utils/filename_parser.py +23 -0
  18. pelican_nlp-0.3.0/pelican_nlp/utils/setup_functions.py +118 -0
  19. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0/pelican_nlp.egg-info}/PKG-INFO +3 -3
  20. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp.egg-info/SOURCES.txt +2 -0
  21. pelican_nlp-0.2.6/pelican_nlp/_version.py +0 -1
  22. pelican_nlp-0.2.6/pelican_nlp/extraction/language_model.py +0 -71
  23. pelican_nlp-0.2.6/pelican_nlp/utils/setup_functions.py +0 -92
  24. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/LICENSE +0 -0
  25. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/MANIFEST.in +0 -0
  26. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_discourse/subjects/sub-01/interview/sub-01_interview_schizophrenia_run-01.rtf +0 -0
  27. {pelican_nlp-0.2.6/pelican_nlp/configuration_files → pelican_nlp-0.3.0/examples/PyPI_testing_fluency}/config_fluency.yml +0 -0
  28. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_fluency/subjects/sub-01/fluency/sub-01_fluency_sem_animals.txt +0 -0
  29. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_fluency/subjects/sub-01/fluency/sub-01_fluency_sem_clothes.txt +0 -0
  30. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_fluency/subjects/sub-01/fluency/sub-01_fluency_sem_food.txt +0 -0
  31. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_fluency/subjects/sub-02/fluency/sub-02_fluency_sem_animals.txt +0 -0
  32. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_fluency/subjects/sub-02/fluency/sub-02_fluency_sem_clothes.txt +0 -0
  33. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_fluency/subjects/sub-02/fluency/sub-02_fluency_sem_food.txt +0 -0
  34. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_image-description_drug.docx +0 -0
  35. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_image-description_placebo.docx +0 -0
  36. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_image-description_drug.docx +0 -0
  37. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_image-description_placebo.docx +0 -0
  38. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_image-description_drug.docx +0 -0
  39. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/examples/PyPI_testing_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_image-description_placebo.docx +0 -0
  40. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/__init__.py +0 -0
  41. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/extract_acoustic_features.py +0 -0
  42. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
  43. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +0 -0
  44. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/behavioral_data.py +0 -0
  45. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/check_duplicates.py +0 -0
  46. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/coherence.py +0 -0
  47. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/config.py +0 -0
  48. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/main.py +0 -0
  49. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +0 -0
  50. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/plot_fluency.py +0 -0
  51. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/plotting_utils.py +0 -0
  52. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/questionnaires_data.py +0 -0
  53. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/stats_fluency.py +0 -0
  54. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/fluency/utils.py +0 -0
  55. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/speaker_diarization_Nils.py +0 -0
  56. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
  57. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/annotation_tool.py +0 -0
  58. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +0 -0
  59. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +0 -0
  60. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +0 -0
  61. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/test.json +0 -0
  62. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/transcribe_audio.py +0 -0
  63. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +0 -0
  64. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/transcription.py +0 -0
  65. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/transcription_gui.py +0 -0
  66. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Nils_backup/transcription/word_boundaries.py +0 -0
  67. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +0 -0
  68. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/Silvia_files/prosogram/prosogram.py +0 -0
  69. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/__init__.py +0 -0
  70. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/cli.py +0 -0
  71. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/configuration_files/config_audio.yml +0 -0
  72. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/configuration_files/config_discourse.yml +0 -0
  73. {pelican_nlp-0.2.6/pelican_nlp/sample_configuration_files → pelican_nlp-0.3.0/pelican_nlp/configuration_files}/config_fluency.yml +0 -0
  74. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/configuration_files/config_general.yml +0 -0
  75. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/configuration_files/config_morteza.yml +0 -0
  76. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/core/__init__.py +0 -0
  77. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/core/audio_document.py +0 -0
  78. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/core/document.py +0 -0
  79. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/extraction/__init__.py +0 -0
  80. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/extraction/distance_from_randomness.py +0 -0
  81. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/extraction/semantic_similarity.py +0 -0
  82. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/extraction/test_documents/test_features.csv +0 -0
  83. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/extraction/test_documents/wallace_1.15_3.txt +0 -0
  84. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/extraction/test_documents/wallace_1.1_3.txt +0 -0
  85. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/extraction/test_documents/wallace_1_4.txt +0 -0
  86. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py +0 -0
  87. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/praat/__init__.py +0 -0
  88. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/__init__.py +0 -0
  89. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/pipeline.py +0 -0
  90. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/speaker_diarization.py +0 -0
  91. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/text_cleaner.py +0 -0
  92. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/text_importer.py +0 -0
  93. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/text_normalizer.py +0 -0
  94. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/preprocessing/text_tokenizer.py +0 -0
  95. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/sample_configuration_files/config_discourse.yml +0 -0
  96. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/sample_configuration_files/config_general.yml +0 -0
  97. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/utils/__init__.py +0 -0
  98. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp/utils/sample_usage.py +0 -0
  99. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp.egg-info/dependency_links.txt +0 -0
  100. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp.egg-info/entry_points.txt +0 -0
  101. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp.egg-info/requires.txt +0 -0
  102. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pelican_nlp.egg-info/top_level.txt +0 -0
  103. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/pyproject.toml +0 -0
  104. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/requirements.txt +0 -0
  105. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/setup.cfg +0 -0
  106. {pelican_nlp-0.2.6 → pelican_nlp-0.3.0}/tests/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pelican_nlp
3
- Version: 0.2.6
3
+ Version: 0.3.0
4
4
  Summary: Preprocessing and Extraction of Linguistic Information for Computational Analysis
5
5
  Author-email: Yves Pauli <yves.pauli@gmail.com>
6
6
  License-Expression: CC-BY-NC-4.0
@@ -69,7 +69,7 @@ Create conda environment
69
69
 
70
70
  .. code-block:: bash
71
71
 
72
- conda create -n pelican-nlp python=3.10
72
+ conda create -n pelican-nlp -c defaults python=3.10
73
73
 
74
74
  Activate environment
75
75
 
@@ -157,7 +157,7 @@ Features
157
157
  Examples
158
158
  ========
159
159
 
160
- You can find example setups on the github repository in the 'examples` folder: https://github.com/ypauli/pelican_nlp/tree/main/examples
160
+ You can find example setups on the github repository in the `examples <https://github.com/ypauli/pelican_nlp/tree/main/examples>`_ folder:
161
161
 
162
162
  Contributing
163
163
  ============
@@ -23,7 +23,7 @@ Create conda environment
23
23
 
24
24
  .. code-block:: bash
25
25
 
26
- conda create -n pelican-nlp python=3.10
26
+ conda create -n pelican-nlp -c defaults python=3.10
27
27
 
28
28
  Activate environment
29
29
 
@@ -111,7 +111,7 @@ Features
111
111
  Examples
112
112
  ========
113
113
 
114
- You can find example setups on the github repository in the 'examples` folder: https://github.com/ypauli/pelican_nlp/tree/main/examples
114
+ You can find example setups on the github repository in the `examples <https://github.com/ypauli/pelican_nlp/tree/main/examples>`_ folder:
115
115
 
116
116
  Contributing
117
117
  ============
@@ -5,7 +5,6 @@ discourse: &discourse_flag true
5
5
  #=====================================
6
6
 
7
7
  #general configurations; always adapt
8
- PATH_TO_PROJECT_FOLDER: "/home/yvespauli/PycharmProjects/PyPI_testing_discourse"
9
8
  language: "german" # Possibly add options for German and English
10
9
 
11
10
  task_name: "interview" # Give name of task used for creation of the input file (e.g., ['fluency', 'interview'])
@@ -4,7 +4,6 @@
4
4
  # Basic Settings
5
5
  # -------------
6
6
  input_file: "text" # Options: 'text' or 'audio'
7
- PATH_TO_PROJECT_FOLDER: "/home/yvespauli/PycharmProjects/PyPI_testing_image-descriptions"
8
7
  language: "german" # Options: 'german', 'english'
9
8
  recompute_everything: true # If false, reuses previously computed results
10
9
 
@@ -0,0 +1 @@
1
+ __version__ = "0.3.0"
@@ -0,0 +1,14 @@
1
+ """
2
+ Global configuration settings for the Pelican project.
3
+
4
+ This file is not the configuration.yml file created for the users adaptations.
5
+ For consistency of pipeline, DO NOT CHANGE.
6
+ """
7
+
8
+ # Debug flag
9
+ DEBUG_MODE = False
10
+
11
+ def debug_print(*args, **kwargs):
12
+ """Print only if debug mode is enabled."""
13
+ if DEBUG_MODE:
14
+ print(*args, **kwargs)
@@ -15,20 +15,24 @@ import os
15
15
  import pandas as pd
16
16
  import re
17
17
 
18
+ from pelican_nlp.config import debug_print
19
+
18
20
  class Corpus:
19
21
  def __init__(self, corpus_name, documents, configuration_settings, project_folder):
20
22
  self.name = corpus_name
23
+ self.key = corpus_name.split('-')[0]
24
+ self.value = corpus_name.split('-')[1]
21
25
  self.documents = documents
22
26
  self.config = configuration_settings
23
27
  self.project_folder = project_folder
24
- self.derivative_dir = project_folder / 'derivatives'
28
+ self.derivatives_dir = project_folder / 'derivatives'
25
29
  self.pipeline = TextPreprocessingPipeline(self.config)
26
30
  self.task = configuration_settings['task_name']
27
31
  self.results_path = None
28
32
 
29
33
  def preprocess_all_documents(self):
30
34
  """Preprocess all documents"""
31
- print('Preprocessing all documents...')
35
+ print(f'Preprocessing all documents of corpus {self.name}...')
32
36
  for document in self.documents:
33
37
  document.detect_sections()
34
38
  document.process_document(self.pipeline)
@@ -43,21 +47,15 @@ class Corpus:
43
47
  """Create separate aggregated results CSV files for each metric."""
44
48
  print("Creating aggregated results files per metric...")
45
49
 
46
- try:
47
- derivatives_path = os.path.dirname(os.path.dirname(self.documents[0].results_path))
48
- except (AttributeError, IndexError):
49
- print("Error: No valid results path found in documents")
50
- return
51
-
52
50
  # Create aggregations folder
53
- aggregation_path = os.path.join(derivatives_path, 'aggregations')
51
+ aggregation_path = os.path.join(self.derivatives_dir, 'aggregations')
54
52
  os.makedirs(aggregation_path, exist_ok=True)
55
53
 
56
54
  # Initialize results dictionary with metrics as keys
57
55
  results_by_metric = {}
58
56
 
59
57
  # Walk through all directories in derivatives
60
- for root, dirs, files in os.walk(derivatives_path):
58
+ for root, dirs, files in os.walk(self.derivatives_dir):
61
59
  # Skip the aggregations directory itself
62
60
  if 'aggregations' in root:
63
61
  continue
@@ -115,6 +113,7 @@ class Corpus:
115
113
  logits_options = self.config['options_logits']
116
114
 
117
115
  print('logits extraction in progress')
116
+
118
117
  model_name = logits_options['model_name']
119
118
  logitsExtractor = LogitsExtractor(logits_options,
120
119
  self.pipeline,
@@ -144,7 +143,7 @@ class Corpus:
144
143
 
145
144
  #'logits' list of dictionaries; keys token, logprob_actual, logprob_max, entropy, most_likely_token
146
145
  store_features_to_csv(logits,
147
- self.derivative_dir,
146
+ self.derivatives_dir,
148
147
  self.documents[i],
149
148
  metric='logits')
150
149
 
@@ -154,9 +153,12 @@ class Corpus:
154
153
  embedding_options = self.config['options_embeddings']
155
154
  print('Embeddings extraction in progress...')
156
155
  embeddingsExtractor = EmbeddingsExtractor(embedding_options, self.project_folder)
156
+ debug_print(len(self.documents))
157
157
  for i in range(len(self.documents)):
158
+
159
+ debug_print(f'cleaned sections: {self.documents[i].cleaned_sections}')
158
160
  for key, section in self.documents[i].cleaned_sections.items():
159
- print(f'Processing section {key}')
161
+ debug_print(f'Processing section {key}')
160
162
 
161
163
  if self.config['discourse']:
162
164
  section = TextDiarizer.parse_speaker(section, self.config['subject_speakertag'], embedding_options['keep_speakertags'])
@@ -175,7 +177,7 @@ class Corpus:
175
177
  from pelican_nlp.extraction.semantic_similarity import calculate_semantic_similarity, \
176
178
  get_semantic_similarity_windows
177
179
  consecutive_similarities, mean_similarity = calculate_semantic_similarity(utterance)
178
- print(f'Mean semantic similarity: {mean_similarity:.4f}')
180
+ debug_print(f'Mean semantic similarity: {mean_similarity:.4f}')
179
181
 
180
182
  for window_size in self.config['options_semantic-similarity']['window_sizes']:
181
183
  window_stats = get_semantic_similarity_windows(utterance, window_size)
@@ -187,7 +189,7 @@ class Corpus:
187
189
  'std_of_window_stds': window_stats[3],
188
190
  'mean_of_window_medians': window_stats[4]
189
191
  }
190
- print(f'Window {window_size} stats - mean: {window_stats[0]:.4f}, std: {window_stats[1]:.4f}, median: {window_stats[4]:.4f}')
192
+ debug_print(f'Window {window_size} stats - mean: {window_stats[0]:.4f}, std: {window_stats[1]:.4f}, median: {window_stats[4]:.4f}')
191
193
  else:
192
194
  window_data = {
193
195
  'mean': window_stats[0] if isinstance(window_stats, tuple) else window_stats,
@@ -195,16 +197,16 @@ class Corpus:
195
197
  }
196
198
 
197
199
  store_features_to_csv(window_data,
198
- self.derivative_dir,
200
+ self.derivatives_dir,
199
201
  self.documents[i],
200
202
  metric=f'semantic-similarity-window-{window_size}')
201
203
 
202
204
  if self.config['options_embeddings']['distance-from-randomness']:
203
205
  from pelican_nlp.extraction.distance_from_randomness import get_distance_from_randomness
204
206
  divergence = get_distance_from_randomness(utterance, self.config["options_dis_from_randomness"])
205
- print(f'Divergence from optimality metrics: {divergence}')
207
+ debug_print(f'Divergence from optimality metrics: {divergence}')
206
208
  store_features_to_csv(divergence,
207
- self.derivative_dir,
209
+ self.derivatives_dir,
208
210
  self.documents[i],
209
211
  metric='distance-from-randomness')
210
212
 
@@ -230,7 +232,7 @@ class Corpus:
230
232
  cleaned_embeddings = utterance if isinstance(utterance, list) else [(k, v) for k, v in utterance.items()]
231
233
 
232
234
  store_features_to_csv(cleaned_embeddings,
233
- self.derivative_dir,
235
+ self.derivatives_dir,
234
236
  self.documents[i],
235
237
  metric='embeddings')
236
238
  return
@@ -241,11 +243,11 @@ class Corpus:
241
243
  results, recording_length = AudioFeatureExtraction.opensmile_extraction(self.documents[i].file, self.config['opensmile_configurations'])
242
244
  self.documents[i].recording_length = recording_length # Store the recording length
243
245
  results['subject_ID'] = self.documents[i].subject_ID # Set the subject ID
244
- print('results obtained')
246
+ print('opensmile results obtained')
245
247
  store_features_to_csv(results,
246
- self.derivative_dir,
247
- self.documents[i],
248
- metric='opensmile-features')
248
+ self.derivatives_dir,
249
+ self.documents[i],
250
+ metric='opensmile-features')
249
251
 
250
252
  def extract_prosogram(self):
251
253
  from pelican_nlp.extraction.acoustic_feature_extraction import AudioFeatureExtraction
@@ -257,14 +259,8 @@ class Corpus:
257
259
  """Create CSV file with summarized document parameters based on config specifications."""
258
260
  print("Creating document information summary...")
259
261
 
260
- try:
261
- derivatives_path = os.path.dirname(os.path.dirname(self.documents[0].results_path))
262
- except (AttributeError, IndexError):
263
- print("Error: No valid results path found in documents")
264
- return
265
-
266
262
  # Create document_information folder inside aggregations
267
- doc_info_path = os.path.join(derivatives_path, 'aggregations', 'document_information')
263
+ doc_info_path = os.path.join(self.derivatives_dir, 'aggregations', 'document_information')
268
264
  os.makedirs(doc_info_path, exist_ok=True)
269
265
 
270
266
  # Define output file path
@@ -293,4 +289,4 @@ class Corpus:
293
289
  # Convert to DataFrame and save to CSV
294
290
  df = pd.DataFrame(document_info)
295
291
  df.to_csv(output_file, index=False)
296
- print(f"Document information saved to: {output_file}")
292
+ debug_print(f"Document information saved to: {output_file}")
@@ -4,12 +4,12 @@ The Subject class stores all subject specific information and a list of correspo
4
4
  """
5
5
 
6
6
  class Subject:
7
- def __init__(self, subjectID, description=None):
7
+ def __init__(self, name, description=None):
8
8
 
9
- self.subjectID = subjectID
9
+ self.name = name
10
+ self.subjectID = None
10
11
  self.gender = None
11
12
  self.age = None
12
- self.name = None
13
13
  self.description = description # Description of the subject
14
14
  self.documents = [] # List of TextDocument instances
15
15
  self.numberOfSessions = None
@@ -49,7 +49,7 @@ class AudioFeatureExtraction:
49
49
  profile (DataFrame): Prosogram analysis results
50
50
  """
51
51
  import parselmouth
52
- from pelican.praat import PROSOGRAM_SCRIPT
52
+ from pelican_nlp.praat import PROSOGRAM_SCRIPT
53
53
  try:
54
54
  sound = parselmouth.Sound(file)
55
55
  # Common Prosogram parameters
@@ -1,6 +1,8 @@
1
1
  from pelican_nlp.extraction.language_model import Model
2
2
  from pelican_nlp.preprocessing.text_tokenizer import TextTokenizer
3
3
 
4
+ from pelican_nlp.config import debug_print
5
+
4
6
  class EmbeddingsExtractor:
5
7
  def __init__(self, embeddings_configurations, project_path):
6
8
  self.embeddings_configurations = embeddings_configurations
@@ -22,7 +24,7 @@ class EmbeddingsExtractor:
22
24
 
23
25
  # Tokenize the input text
24
26
  inputs = self.Tokenizer.tokenize_text(text)
25
- print(f'inputs are: {inputs}')
27
+ debug_print(f'inputs are: {inputs}')
26
28
 
27
29
  if self.embeddings_configurations['pytorch_based_model']:
28
30
  #e.g. RoBERTa Model or Llama Model
@@ -2,6 +2,8 @@ import torch
2
2
  import torch.nn.functional as F
3
3
  from tqdm import tqdm
4
4
 
5
+ from pelican_nlp.config import debug_print
6
+
5
7
  class LogitsExtractor:
6
8
  def __init__(self, options, pipeline, project_path):
7
9
 
@@ -13,9 +15,9 @@ class LogitsExtractor:
13
15
 
14
16
  def extract_features(self, section, tokenizer, model):
15
17
 
16
- print(f'section to tokenize: {section}')
18
+ debug_print(f'section to tokenize: {section}')
17
19
  tokens = tokenizer.tokenize_text(section)
18
- print(tokens)
20
+ debug_print(tokens)
19
21
 
20
22
  chunk_size = self.options['chunk_size']
21
23
  overlap_size = self.options['overlap_size']
@@ -0,0 +1,125 @@
1
+ import torch
2
+ import psutil
3
+ import os
4
+ import shutil
5
+
6
+ from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
7
+ from transformers import AutoModelForCausalLM
8
+
9
+ class Model:
10
+ def __init__(self, model_name, project_path):
11
+ self.model_name = model_name
12
+ self.model_instance = None
13
+ self.device_map = None
14
+ self.PROJECT_PATH = project_path
15
+
16
+ def load_model(self, empty_weights=False, trust_remote_code=False):
17
+ """Loads and configures the model"""
18
+
19
+ if self.model_name == 'fastText':
20
+ import fasttext
21
+ import fasttext.util
22
+
23
+ # Create a model directory if it doesn't exist
24
+ model_dir = os.path.join(os.path.expanduser('~'), '.fasttext')
25
+ os.makedirs(model_dir, exist_ok=True)
26
+
27
+ # Set the model path using proper OS path joining
28
+ model_path = os.path.join(model_dir, 'cc.de.300.bin')
29
+
30
+ # Download only if model doesn't exist or is invalid
31
+ need_download = True
32
+ if os.path.exists(model_path):
33
+ try:
34
+ self.model_instance = fasttext.load_model(model_path)
35
+ need_download = False
36
+ except ValueError:
37
+ print(f"Existing model file is corrupted, re-downloading...")
38
+ os.remove(model_path)
39
+
40
+ if need_download:
41
+ print("Downloading FastText model...")
42
+ try:
43
+ # Try the built-in FastText downloader first
44
+ fasttext.util.download_model('de', if_exists='ignore')
45
+ # Find the downloaded file in current directory
46
+ downloaded_file = 'cc.de.300.bin'
47
+ if os.path.exists(downloaded_file):
48
+ # Move the file to the correct location
49
+ shutil.move(downloaded_file, model_path)
50
+ else:
51
+ raise FileNotFoundError("FastText downloader didn't create the expected file")
52
+ except (OSError, ValueError, FileNotFoundError) as e:
53
+ print(f"FastText downloader failed, using direct download: {str(e)}")
54
+ # Direct download fallback
55
+ import urllib.request
56
+ url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz'
57
+ print(f"Downloading from {url}...")
58
+ temp_gz_path = model_path + '.gz'
59
+ urllib.request.urlretrieve(url, temp_gz_path)
60
+
61
+ # Decompress the file
62
+ print("Decompressing model file...")
63
+ import gzip
64
+ with gzip.open(temp_gz_path, 'rb') as f_in:
65
+ with open(model_path, 'wb') as f_out:
66
+ f_out.write(f_in.read())
67
+ os.remove(temp_gz_path)
68
+ print("Model decompressed successfully")
69
+
70
+ # Verify the downloaded model
71
+ try:
72
+ self.model_instance = fasttext.load_model(model_path)
73
+ except ValueError as e:
74
+ raise ValueError(f"Failed to load downloaded model: {str(e)}. Please try removing {model_path} and running again.")
75
+
76
+ print(f'FastText model loaded successfully from {model_path}')
77
+ elif self.model_name == 'xlm-roberta-base':
78
+ from transformers import AutoModel
79
+ self.model_instance = AutoModel.from_pretrained(
80
+ self.model_name,
81
+ trust_remote_code=trust_remote_code,
82
+ use_safetensors=True
83
+ )
84
+ print('RoBERTa model loaded.')
85
+ elif self.model_name == 'DiscoResearch/Llama3-German-8B-32k':
86
+ if empty_weights:
87
+ with init_empty_weights():
88
+ self.model_instance = AutoModelForCausalLM.from_pretrained(
89
+ self.model_name,
90
+ trust_remote_code=trust_remote_code,
91
+ use_safetensors=True
92
+ )
93
+ else:
94
+ self.model_instance = AutoModelForCausalLM.from_pretrained(
95
+ self.model_name,
96
+ trust_remote_code=trust_remote_code,
97
+ use_safetensors=True
98
+ )
99
+ print(f'Llama3-German-8B-32k loaded')
100
+ else:
101
+ raise ValueError("Invalid model name.")
102
+
103
+ if self.model_name == 'xlm-roberta-base' or self.model_name == 'DiscoResearch/Llama3-German-8B-32k':
104
+ # Additional model setup
105
+ self.device_map_creation()
106
+
107
+ self.model_instance = dispatch_model(self.model_instance, device_map=self.device_map)
108
+ print('Model dispatched to appropriate devices.')
109
+
110
+ def device_map_creation(self):
111
+ #check if cuda is available
112
+ if not torch.cuda.is_available():
113
+ print('Careful: Cuda not available, using CPU. This can be slow. Consider running pipeline on different device')
114
+ else:
115
+ print(f'{torch.cuda.get_device_name(0)} available.')
116
+
117
+ available_VRAM = str(int(torch.cuda.get_device_properties(0).total_memory/(1024 ** 3))-3)+'GB'
118
+ available_RAM = str(int(psutil.virtual_memory().total/(1024 ** 3))-3)+'GB'
119
+
120
+ #create device map and offload directory if it doesn't exist
121
+ self.device_map = infer_auto_device_map(self.model_instance, max_memory={
122
+ 0: available_VRAM,
123
+ 'cpu': available_RAM,
124
+ 'disk': '200GB'
125
+ })
@@ -1,9 +1,9 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- Pelican Project
4
- ===============
3
+ Pelican-nlp Project
4
+ ===================
5
5
 
6
- Pelican is a tool developed to enable consistent and reproducible language processing.
6
+ Pelican-nlp is a tool developed to enable consistent and reproducible language processing.
7
7
  Main entry point for the Pelican project handling document processing and metric extraction.
8
8
 
9
9
  Author: Yves Pauli
@@ -23,6 +23,9 @@ import sys
23
23
  from pelican_nlp.core import Corpus
24
24
  from pelican_nlp.utils.setup_functions import subject_instantiator, load_config, remove_previous_derivative_dir
25
25
  from pelican_nlp.preprocessing import LPDS
26
+ from pelican_nlp.utils.filename_parser import parse_lpds_filename
27
+
28
+ from config import debug_print
26
29
 
27
30
  project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_fluency/config_fluency.yml'
28
31
 
@@ -30,7 +33,8 @@ class Pelican:
30
33
 
31
34
  """Main class for the Pelican project handling document processing and metric extraction."""
32
35
 
33
- def __init__(self, config_path: str = None, dev_mode: bool = True) -> None:
36
+ def __init__(self, config_path: str = None, dev_mode: bool = False) -> None:
37
+
34
38
  self.dev_mode = dev_mode
35
39
 
36
40
  # If no config path is provided, use the default config from package; used for dev-mode
@@ -83,23 +87,25 @@ class Pelican:
83
87
  subjects = subject_instantiator(self.config, self.project_path)
84
88
 
85
89
  # Process each corpus
86
- for corpus_name in self.config['corpus_names']:
87
- self._process_corpus(corpus_name, subjects)
88
-
89
- def _process_corpus(self, corpus_name: str, subjects: List) -> None:
90
+ for corpus_value in self.config['corpus_values']:
91
+ self._process_corpus(self.config['corpus_key'], corpus_value, subjects)
90
92
 
93
+ def _process_corpus(self, corpus_key: str, corpus_value: str, subjects: List) -> None:
91
94
  """Process a single corpus including preprocessing and metric extraction."""
92
- print(f'Processing corpus: {corpus_name}')
93
95
 
94
- corpus_documents = self._identify_corpus_files(subjects, corpus_name)
95
- corpus = Corpus(corpus_name, corpus_documents[corpus_name], self.config, self.project_path)
96
+ corpus_entity = corpus_key + '-' + corpus_value
97
+ print(f'Processing corpus: {corpus_entity}')
98
+ debug_print(subjects, corpus_entity)
99
+ corpus_documents = self._identify_corpus_files(subjects, corpus_entity)
100
+ debug_print(len(corpus_documents))
101
+ corpus = Corpus(corpus_entity, corpus_documents[corpus_entity], self.config, self.project_path)
96
102
 
97
- for document in corpus_documents[corpus_name]:
98
- document.corpus_name = corpus_name
103
+ for document in corpus_documents[corpus_entity]:
104
+ document.corpus_name = corpus_entity
99
105
 
100
106
  if self.config['input_file']=='text':
101
107
  corpus.preprocess_all_documents()
102
- print(f'Corpus {corpus_name} is preprocessed')
108
+ print(f'Corpus {corpus_key} is preprocessed')
103
109
 
104
110
  self._extract_metrics(corpus)
105
111
 
@@ -140,18 +146,34 @@ class Pelican:
140
146
 
141
147
  self._clear_gpu_memory()
142
148
 
143
- def _identify_corpus_files(self, subjects: List, corpus: str) -> Dict:
144
- """Identify and group files belonging to a specific corpus."""
145
- corpus_dict = {corpus: []}
146
- for subject in subjects:
147
- for document in subject.documents:
148
- name = Path(document.name)
149
- document.extension = name.suffix
150
- # Split by both '_' and '.' to get all parts
151
- parts = name.stem.replace('.', '_').split('_')
152
- # Check if corpus name appears in any part
153
- if corpus in parts:
154
- corpus_dict[corpus].append(document)
149
def _identify_corpus_files(self, subjects: List, entity: str) -> Dict:
    """Identify and group files based on specified entity-value pair.

    Args:
        subjects: Objects exposing a ``documents`` iterable; every document
            needs a ``name`` that ``parse_lpds_filename`` can parse.
        entity: Either ``'<key>-<value>'`` or a bare value. With a key, only
            that filename entity is compared; without one, a document
            matches if any of its parsed entity values equals ``entity``.

    Returns:
        A single-entry dict mapping ``entity`` to the list of matching
        documents, in subject/document iteration order.
    """
    debug_print('identifying corpus files')
    debug_print(len(subjects))

    # Split on the first hyphen only, so the value part may itself contain '-'.
    if '-' in entity:
        key, value = entity.split('-', 1)
    else:
        key, value = None, entity

    matched = []
    for subject in subjects:
        debug_print(subject.documents)
        for document in subject.documents:
            # presumably a mapping of entity key -> value parsed from the
            # file name; verify against parse_lpds_filename
            entities = parse_lpds_filename(document.name)
            debug_print(entities)

            # Parsed values are stringified so non-string values still
            # compare equal to the string taken from the configuration.
            if key is None:
                is_match = any(str(val) == value for val in entities.values())
            else:
                is_match = key in entities and str(entities[key]) == value

            if is_match:
                matched.append(document)

    return {entity: matched}
156
178
 
157
179
  def _handle_output_directory(self) -> None:
@@ -207,4 +229,4 @@ class Pelican:
207
229
 
208
230
 
209
231
  if __name__ == '__main__':
210
- Pelican(project_path).run()
232
+ Pelican(project_path, dev_mode=True).run()
@@ -1,6 +1,8 @@
1
1
  import re
2
2
  import os
3
3
 
4
+ from pelican_nlp.config import debug_print
5
+
4
6
  class LPDS:
5
7
  def __init__(self, project_folder, multiple_sessions):
6
8
  self.project_folder = project_folder
@@ -18,7 +20,7 @@ class LPDS:
18
20
  suggested_files = ["dataset_description.json", "README", "CHANGES", "participants.tsv"]
19
21
  for file in suggested_files:
20
22
  if not os.path.isfile(os.path.join(self.project_folder, file)):
21
- print(f"Warning: Missing suggested file '{file}' in the project folder.")
23
+ debug_print(f"Warning: Missing suggested file '{file}' in the project folder.")
22
24
 
23
25
  # Check for the 'subjects' folder
24
26
  if not os.path.isdir(self.subjects_folder):
@@ -38,15 +40,16 @@ class LPDS:
38
40
  if self.multiple_sessions:
39
41
  session_folders = [f for f in os.listdir(subject_path) if
40
42
  os.path.isdir(os.path.join(subject_path, f))]
41
- if not session_folders:
43
+ if session_folders:
44
+ if 'ses-01' not in session_folders:
45
+ print(f"Warning: Ideally, the session folders should follow the naming convention 'ses-x'.")
46
+ else:
42
47
  print(f"Warning: No session folders found in '{subject_folder}'.")
43
- if 'ses-01' not in session_folders:
44
- print(f"Warning: Ideally, the session folders should follow the naming convention 'ses-x'.")
45
48
 
46
49
  # Check for optional subject_metadata file
47
50
  metadata_file = os.path.join(subject_path, "subject_metadata")
48
51
  if not os.path.isfile(metadata_file):
49
- #print(f"Note: Optional 'subject_metadata' file is missing in '{subject_folder}'.")
52
+ debug_print(f"Note: Optional 'subject_metadata' file is missing in '{subject_folder}'.")
50
53
  continue
51
54
 
52
55
  session_folders = subject_folder
@@ -68,7 +71,7 @@ class LPDS:
68
71
  else:
69
72
  pattern = fr"^{subject_folder}_{task_folder}.*"
70
73
  if not re.match(pattern, file):
71
- print(f"Warning: File '{file}' in '{task_folder}' does not follow the LPDS naming conventions")
74
+ debug_print(f"Warning: File '{file}' in '{task_folder}' does not follow the LPDS naming conventions")
72
75
 
73
76
  def derivative_dir_creator(self):
74
77
  # Create the 'derivatives' folder if it doesn't exist
@@ -5,7 +5,6 @@ fluency_task: &fluency_flag true
5
5
  #========================================
6
6
 
7
7
  #general configurations; always adapt
8
- PATH_TO_PROJECT_FOLDER: "/home/yvespauli/PycharmProjects/pelican_testing"
9
8
  language: "german"
10
9
  multiple_sessions: &session_flag false
11
10