pelican-nlp 0.3.2__tar.gz → 0.3.3__tar.gz

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (114)
  1. {pelican_nlp-0.3.2/pelican_nlp.egg-info → pelican_nlp-0.3.3}/PKG-INFO +1 -1
  2. {pelican_nlp-0.3.2/examples/PyPI_testing_discourse → pelican_nlp-0.3.3/examples/example_discourse}/config_discourse.yml +13 -9
  3. {pelican_nlp-0.3.2/pelican_nlp/configuration_files → pelican_nlp-0.3.3/examples/example_fluency}/config_fluency.yml +2 -3
  4. pelican_nlp-0.3.3/examples/example_general/config_general.yml +146 -0
  5. pelican_nlp-0.3.3/pelican_nlp/_version.py +1 -0
  6. pelican_nlp-0.3.3/pelican_nlp/cli.py +34 -0
  7. pelican_nlp-0.3.3/pelican_nlp/config.py +35 -0
  8. pelican_nlp-0.3.3/pelican_nlp/extraction/extract_embeddings.py +106 -0
  9. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/extraction/extract_logits.py +6 -1
  10. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/extraction/language_model.py +1 -2
  11. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/main.py +9 -10
  12. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/preprocessing/text_tokenizer.py +7 -2
  13. pelican_nlp-0.3.3/pelican_nlp/project_graph/graph_visualization.py +109 -0
  14. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/sample_configuration_files/config_discourse.yml +14 -7
  15. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/sample_configuration_files/config_fluency.yml +2 -3
  16. pelican_nlp-0.3.3/pelican_nlp/sample_configuration_files/config_general.yml +146 -0
  17. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/utils/setup_functions.py +1 -1
  18. pelican_nlp-0.3.3/pelican_nlp/utils/unittests/examples/example_discourse/config_discourse.yml +109 -0
  19. pelican_nlp-0.3.3/pelican_nlp/utils/unittests/examples/example_fluency/config_fluency.yml +106 -0
  20. pelican_nlp-0.3.2/pelican_nlp/sample_configuration_files/config_general.yml → pelican_nlp-0.3.3/pelican_nlp/utils/unittests/examples/example_image-descriptions/config_image-descriptions.yml +25 -20
  21. pelican_nlp-0.3.3/pelican_nlp/utils/unittests/test_examples.py +211 -0
  22. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3/pelican_nlp.egg-info}/PKG-INFO +1 -1
  23. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp.egg-info/SOURCES.txt +15 -16
  24. pelican_nlp-0.3.2/examples/PyPI_testing_discourse/subjects/sub-01/interview/sub-01_interview_schizophrenia_run-01.rtf +0 -40
  25. pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_image-description_drug.docx +0 -0
  26. pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_image-description_placebo.docx +0 -0
  27. pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_image-description_drug.docx +0 -0
  28. pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_image-description_placebo.docx +0 -0
  29. pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_image-description_drug.docx +0 -0
  30. pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_image-description_placebo.docx +0 -0
  31. pelican_nlp-0.3.2/pelican_nlp/_version.py +0 -1
  32. pelican_nlp-0.3.2/pelican_nlp/cli.py +0 -18
  33. pelican_nlp-0.3.2/pelican_nlp/config.py +0 -14
  34. pelican_nlp-0.3.2/pelican_nlp/extraction/extract_embeddings.py +0 -59
  35. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/LICENSE +0 -0
  36. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/MANIFEST.in +0 -0
  37. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/README.rst +0 -0
  38. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/docs/images/pelican_logo.png +0 -0
  39. /pelican_nlp-0.3.2/examples/PyPI_testing_fluency/subjects/sub-01/fluency/sub-01_fluency_sem_animals.txt → /pelican_nlp-0.3.3/examples/example_fluency/subjects/sub-01/fluency/sub-01_task-fluency_cat-semantic_acq-animals_text.txt +0 -0
  40. /pelican_nlp-0.3.2/examples/PyPI_testing_fluency/subjects/sub-01/fluency/sub-01_fluency_sem_clothes.txt → /pelican_nlp-0.3.3/examples/example_fluency/subjects/sub-01/fluency/sub-01_task-fluency_cat-semantic_acq-clothes_text.txt +0 -0
  41. /pelican_nlp-0.3.2/examples/PyPI_testing_fluency/subjects/sub-01/fluency/sub-01_fluency_sem_food.txt → /pelican_nlp-0.3.3/examples/example_fluency/subjects/sub-01/fluency/sub-01_task-fluency_cat-semantic_acq-food_text.txt +0 -0
  42. /pelican_nlp-0.3.2/examples/PyPI_testing_fluency/subjects/sub-02/fluency/sub-02_fluency_sem_animals.txt → /pelican_nlp-0.3.3/examples/example_fluency/subjects/sub-02/fluency/sub-02_task-fluency_cat-semantic_acq-animals_text.txt +0 -0
  43. /pelican_nlp-0.3.2/examples/PyPI_testing_fluency/subjects/sub-02/fluency/sub-02_fluency_sem_clothes.txt → /pelican_nlp-0.3.3/examples/example_fluency/subjects/sub-02/fluency/sub-02_task-fluency_cat-semantic_acq-clothes_text.txt +0 -0
  44. /pelican_nlp-0.3.2/examples/PyPI_testing_fluency/subjects/sub-02/fluency/sub-02_fluency_sem_food.txt → /pelican_nlp-0.3.3/examples/example_fluency/subjects/sub-02/fluency/sub-02_task-fluency_cat-semantic_acq-food_text.txt +0 -0
  45. {pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions → pelican_nlp-0.3.3/examples/example_image-descriptions}/config_image-descriptions.yml +0 -0
  46. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/__init__.py +0 -0
  47. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/extract_acoustic_features.py +0 -0
  48. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
  49. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +0 -0
  50. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/fluency/behavioral_data.py +0 -0
  51. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/fluency/check_duplicates.py +0 -0
  52. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/fluency/coherence.py +0 -0
  53. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/fluency/config.py +0 -0
  54. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/fluency/main.py +0 -0
  55. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +0 -0
  56. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/fluency/plot_fluency.py +0 -0
  57. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/fluency/plotting_utils.py +0 -0
  58. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/fluency/questionnaires_data.py +0 -0
  59. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/fluency/stats_fluency.py +0 -0
  60. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/fluency/utils.py +0 -0
  61. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/speaker_diarization_Nils.py +0 -0
  62. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
  63. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/transcription/annotation_tool.py +0 -0
  64. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +0 -0
  65. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +0 -0
  66. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +0 -0
  67. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/transcription/test.json +0 -0
  68. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/transcription/transcribe_audio.py +0 -0
  69. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +0 -0
  70. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/transcription/transcription.py +0 -0
  71. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/transcription/transcription_gui.py +0 -0
  72. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Nils_backup/transcription/word_boundaries.py +0 -0
  73. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +0 -0
  74. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/Silvia_files/prosogram/prosogram.py +0 -0
  75. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/__init__.py +0 -0
  76. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/configuration_files/config_audio.yml +0 -0
  77. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/configuration_files/config_discourse.yml +0 -0
  78. {pelican_nlp-0.3.2/examples/PyPI_testing_fluency → pelican_nlp-0.3.3/pelican_nlp/configuration_files}/config_fluency.yml +0 -0
  79. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/configuration_files/config_general.yml +0 -0
  80. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/configuration_files/config_morteza.yml +0 -0
  81. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/core/__init__.py +0 -0
  82. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/core/audio_document.py +0 -0
  83. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/core/corpus.py +0 -0
  84. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/core/document.py +0 -0
  85. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/core/subject.py +0 -0
  86. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/extraction/__init__.py +0 -0
  87. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/extraction/acoustic_feature_extraction.py +0 -0
  88. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/extraction/distance_from_randomness.py +0 -0
  89. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/extraction/semantic_similarity.py +0 -0
  90. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/extraction/test_documents/test_features.csv +0 -0
  91. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/extraction/test_documents/wallace_1.15_3.txt +0 -0
  92. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/extraction/test_documents/wallace_1.1_3.txt +0 -0
  93. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/extraction/test_documents/wallace_1_4.txt +0 -0
  94. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py +0 -0
  95. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/praat/__init__.py +0 -0
  96. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/preprocessing/LPDS.py +0 -0
  97. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/preprocessing/__init__.py +0 -0
  98. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/preprocessing/pipeline.py +0 -0
  99. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/preprocessing/speaker_diarization.py +0 -0
  100. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/preprocessing/text_cleaner.py +0 -0
  101. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/preprocessing/text_importer.py +0 -0
  102. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/preprocessing/text_normalizer.py +0 -0
  103. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/utils/__init__.py +0 -0
  104. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/utils/csv_functions.py +0 -0
  105. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/utils/filename_parser.py +0 -0
  106. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/utils/sample_usage.py +0 -0
  107. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp.egg-info/dependency_links.txt +0 -0
  108. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp.egg-info/entry_points.txt +0 -0
  109. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp.egg-info/requires.txt +0 -0
  110. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp.egg-info/top_level.txt +0 -0
  111. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pyproject.toml +0 -0
  112. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/requirements.txt +0 -0
  113. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/setup.cfg +0 -0
  114. {pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/tests/__init__.py +0 -0
{pelican_nlp-0.3.2/pelican_nlp.egg-info → pelican_nlp-0.3.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pelican_nlp
-Version: 0.3.2
+Version: 0.3.3
 Summary: Preprocessing and Extraction of Linguistic Information for Computational Analysis
 Author-email: Yves Pauli <yves.pauli@gmail.com>
 License-Expression: CC-BY-NC-4.0
{pelican_nlp-0.3.2/examples/PyPI_testing_discourse → pelican_nlp-0.3.3/examples/example_discourse}/config_discourse.yml
@@ -7,11 +7,15 @@ discourse: &discourse_flag true
 #general configurations; always adapt
 language: "german" # Possibly add options for German and English
 
-task_name: "interview" # Give name of task used for creation of the input file (e.g., ['fluency', 'interview'])
-corpus_names:
+task_name: "interview"
+
+#Create analysis corpus, group files based on corpus entity.
+corpus_key: "acq"
+corpus_values: #group names
+  - "placebo"
   - "schizophrenia"
 
-metric_to_extract: "embeddings" #Possible options: 'logits' or 'embeddings'
+metric_to_extract: "logits" #Possible options: 'logits' or 'embeddings'
 
 number_of_speakers: 3
 subject_speakertag: "B"
@@ -43,7 +47,7 @@ options_logits:
   keep_speakertags: true
 
 options_embeddings:
-  tokenization_method: "model_roberta" #or "whitespace", "model"
+  tokenization_method: "model" #"model" or "whitespace"
   max_length: 512 #max sequence length
   model_name: "xlm-roberta-base" #e.g. "fastText", "xlm-roberta-base"
   pytorch_based_model: true
@@ -59,10 +63,10 @@ options_embeddings:
     remove_punctuation_and_symbols: true
     remove_brackets_and_content: true
   semantic-similarity: false
+  distance-from-randomness: false
   window_size: null
   clean_tokens: false
-
-  distance-from-randomness: false
+  divergence_from_optimality: false
 #================================================================================
 
 #Extra configurations:
@@ -93,13 +97,13 @@ normalization_options:
   method: "lemmatization" #Options: lemmatization or stemming
 #================================================================
 
+create_aggregation_of_results: false
+output_document_information: false
+
 #Detail configurations; Changes optional, mostly used for quality checking / error handling
 number_of_subjects: null # Specify number of subjects; if 'null', number of subjects is automatically detected
 multiple_sessions: false # Set to True if multiple sessions per subject
 
 recompute_everything: true #If set to 'false' pelican-nlp will try to reuse previously computed results stored on your drive
 
-create_aggregation_of_results: false
-output_document_information: false
-
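The corpus_names list from 0.3.2 is replaced by a corpus_key/corpus_values pair, so corpora are now built by matching an entity in the LPDS-style filename (here the acq entity) rather than by a fixed list of corpus names; the same change appears in config_fluency.yml below. A minimal sketch of the grouping this implies, assuming filenames carry underscore-separated key-value entities such as sub-01_task-interview_acq-placebo_text.txt (the parser below is illustrative, not the package's filename_parser):

from collections import defaultdict

def parse_entities(filename):
    """Illustrative parser for LPDS-style names like 'sub-01_task-fluency_acq-animals_text.txt'."""
    stem = filename.rsplit(".", 1)[0]
    return dict(p.split("-", 1) for p in stem.split("_") if "-" in p)

def build_corpora(filenames, corpus_key, corpus_values):
    """Group files whose corpus_key entity matches one of corpus_values."""
    corpora = defaultdict(list)
    for name in filenames:
        value = parse_entities(name).get(corpus_key)
        if value in corpus_values:
            corpora[value].append(name)
    return corpora

files = ["sub-01_task-fluency_acq-animals_text.txt",
         "sub-01_task-fluency_acq-food_text.txt"]
print(build_corpora(files, "acq", ["animals", "food"]))
# defaultdict(<class 'list'>, {'animals': [...], 'food': [...]})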
 
{pelican_nlp-0.3.2/pelican_nlp/configuration_files → pelican_nlp-0.3.3/examples/example_fluency}/config_fluency.yml
@@ -8,7 +8,8 @@ fluency_task: &fluency_flag true
 language: "german"
 multiple_sessions: &session_flag false
 
-corpus_names: #names of fluency tasks (e.g. "animals", "clothes")
+corpus_key: "acq"
+corpus_values: #names of fluency tasks (e.g. "animals", "clothes")
   - "animals"
   - "clothes"
   - "food"
@@ -103,5 +104,3 @@ filename_components:
   metric: true
   additional_tags: []
 
-
-
pelican_nlp-0.3.3/examples/example_general/config_general.yml
@@ -0,0 +1,146 @@
+# Master Configuration File
+# ========================
+
+# Basic Settings
+# -------------
+input_file: "text" # Options: 'text' or 'audio'
+language: "german"
+recompute_everything: true # If false will give warning if output folder already exists
+
+# Task Configuration
+# -----------------
+task_name: null # Name of task used for creation of data
+fluency_task: &fluency_flag false # Flag for fluency-specific settings
+discourse: &discourse_flag false # Flag for discourse-specific settings
+
+# Corpus Configuration
+# ------------------
+corpus_key: null # Entity key to group files for analysis
+corpus_values: # Corresponding entity values found in dataset
+  - "healthy-control"
+  - "placebo"
+
+# Session and Subject Settings
+# --------------------------
+multiple_sessions: false
+number_of_subjects: null # If null, auto-detected
+number_of_speakers: 1 # Specify amount of speakers for discourse files
+subject_speakertag: null # Speaker tag for subject (e.g., "B"), only for discourse
+
+# Document Structure
+# ----------------
+has_multiple_sections: false
+has_section_titles: false
+section_identification: null # e.g., "Section:", in case of multiple sections
+number_of_sections: null # If null, auto-detected, specify for multiple sections to check section detection
+
+# Processing Pipeline
+# -----------------
+pipeline_options: # Just for data preprocessing without metric extraction
+  quality_check: false
+  clean_text: true
+  tokenize_text: false
+  normalize_text: false
+
+# Metric Extraction
+# ---------------
+metric_to_extract: "embeddings" # Options: 'embeddings', 'logits'
+output_document_information: true
+
+# Cleaning Options
+# --------------
+cleaning_options:
+  general_cleaning: true # General cleaning applied to most datasets, check specifications in section "general_cleaning_options"
+  remove_punctuation: false
+  lowercase: true
+  remove_brackets_and_bracketcontent: false
+  remove_timestamps: false
+  timestamp_pattern_example: null # e.g., "#00:00:23-00#", only if remove_timestamps = True
+  # Fluency-specific options
+  fluency_task: *fluency_flag
+  word_splitter: ';'
+  remove_hyphens: true
+  remove_duplicates: true
+
+general_cleaning_options:
+  strip_whitespace: true
+  merge_multiple_whitespaces: true
+  remove_whitespace_before_punctuation: true
+  merge_newline_characters: true
+  remove_backslashes: true
+
+# Embedding Options
+# ---------------
+options_embeddings:
+  tokenization_method: "whitespace" # Options: 'whitespace', 'model'
+  model_name: "fastText" # Options: 'fastText', 'xlm-roberta-base'
+  pytorch_based_model: false
+  method: "model_instance"
+  max_length: 512
+  clean_embedding_tokens: true
+  remove_punctuation: false
+  lowercase: false
+  keep_speakertags: false
+  semantic-similarity: true
+  window_size: null
+  clean_tokens: true
+  divergence_from_optimality: false
+  output_options:
+    exclude_special_tokens: true
+    remove_'_'_character: true
+    remove_speaker_labels: true
+    remove_punctuation_and_symbols: true
+    remove_brackets_and_content: true
+
+# Logits Options
+# -------------
+options_logits:
+  chunk_size: 128
+  overlap_size: 64
+  tokenization_method: "model"
+  model_name: "DiscoResearch/Llama3-German-8B-32k"
+  remove_punctuation: true
+  lowercase: true
+  keep_speakertags: true
+
+# Analysis Options
+# --------------
+options_semantic-similarity:
+  window_sizes: # 'all' or window size as integer
+    - 2
+    - 8
+
+options_dis_from_randomness:
+  window_size: 8
+  min_len: null
+  bootstrap: 10000
+  shuffle_mode: 'include0_includeN'
+  parallel_computing: false
+
+# Normalization Options
+# -------------------
+normalization_options:
+  method: "lemmatization" # Options: 'lemmatization', 'stemming'
+
+# Document Information Output
+# -------------------------
+document_information_output:
+  parameters:
+    - subject_ID
+    - fluency_word_count
+    - fluency_duplicate_count
+
+# Filename Configuration
+# --------------------
+filename_components:
+  subject: true # mandatory
+  session: false
+  task: true # mandatory
+  task_addition: false
+  corpus: true # mandatory
+  metric: true
+  additional_tags: []
+
+# Additional Settings
+# -----------------
+create_aggregation_of_results: true
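Note the YAML anchors (&fluency_flag, &discourse_flag) that later options reference via aliases such as *fluency_flag, so task flags propagate into cleaning_options automatically. A minimal sketch of loading such a config, assuming only PyYAML is installed (safe_load resolves anchors and aliases):

import yaml

with open("config_general.yml") as fh:
    cfg = yaml.safe_load(fh)

# The alias *fluency_flag resolves to the value of &fluency_flag.
assert cfg["cleaning_options"]["fluency_task"] == cfg["fluency_task"]
print(cfg["metric_to_extract"])                 # "embeddings"
print(cfg["options_embeddings"]["model_name"])  # "fastText"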
pelican_nlp-0.3.3/pelican_nlp/_version.py
@@ -0,0 +1 @@
+__version__ = "0.3.3"
pelican_nlp-0.3.3/pelican_nlp/cli.py
@@ -0,0 +1,34 @@
+import os
+from pathlib import Path
+from pelican_nlp.main import Pelican
+from pelican_nlp.config import RUN_TESTS, run_tests
+
+def main():
+    # Run tests if enabled
+    if RUN_TESTS:
+        print("Running tests...")
+        run_tests()
+        return
+
+    # Get the package directory's sample_configuration_files folder
+    package_dir = Path(__file__).parent
+    config_dir = package_dir / 'sample_configuration_files'
+
+    if not config_dir.exists():
+        print("sample_configuration_files directory not found in package directory.")
+        return
+
+    config_files = [f for f in os.listdir(config_dir) if f.endswith(".yml")]
+    if not config_files:
+        print("No .yml configuration file found in the sample_configuration_files directory.")
+        return
+
+    if len(config_files) > 1:
+        print("More than one configuration file found in sample_configuration_files directory - please specify which one to use")
+        return
+
+    config_file = str(config_dir / config_files[0])
+    print(f"Using configuration file: {config_file}")
+
+    pelican = Pelican(config_file)
+    pelican.run()
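The CLI is a thin wrapper: once a config file is resolved, it simply instantiates Pelican and runs it. The same can be done programmatically; a minimal sketch using one of the example configs shipped in this release (the path is illustrative):

from pelican_nlp.main import Pelican

# Equivalent to what cli.main() does after resolving a config file:
pelican = Pelican("examples/example_fluency/config_fluency.yml")
pelican.run()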
pelican_nlp-0.3.3/pelican_nlp/config.py
@@ -0,0 +1,35 @@
+"""
+Global configuration settings for the Pelican project.
+
+This file is not the configuration.yml file created for the users adaptations.
+For consistency of pipeline, DO NOT CHANGE.
+"""
+
+# Debug flag
+DEBUG_MODE = True
+
+# Test flag - set to True to run all example tests
+RUN_TESTS = False
+
+def debug_print(*args, **kwargs):
+    """Print only if debug mode is enabled."""
+    DEBUG_MODE = True
+    if DEBUG_MODE:
+        print(*args, **kwargs)
+
+def run_tests():
+    """Run all example tests if RUN_TESTS is enabled."""
+    if RUN_TESTS:
+        import unittest
+        from pathlib import Path
+
+        # Get the path to the test file
+        test_file = Path(__file__).parent / "utils" / "unittests" / "test_examples.py"
+
+        # Create a test suite and add the test file
+        loader = unittest.TestLoader()
+        suite = loader.discover(str(test_file.parent), pattern="test_examples.py")
+
+        # Run the tests
+        runner = unittest.TextTestRunner(verbosity=2)
+        runner.run(suite)
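Note that debug_print re-assigns DEBUG_MODE = True locally, which shadows the module-level flag and makes the guard always true, so debug output cannot be switched off by editing the global. If the intent is for the module-level flag to control output, a corrected sketch would drop the local assignment:

def debug_print(*args, **kwargs):
    """Print only if the module-level DEBUG_MODE flag is enabled."""
    if DEBUG_MODE:  # reads the global; no local re-assignment
        print(*args, **kwargs)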
pelican_nlp-0.3.3/pelican_nlp/extraction/extract_embeddings.py
@@ -0,0 +1,106 @@
+from pelican_nlp.extraction.language_model import Model
+from pelican_nlp.preprocessing.text_tokenizer import TextTokenizer
+
+from pelican_nlp.config import debug_print
+
+class EmbeddingsExtractor:
+    def __init__(self, embeddings_configurations, project_path):
+        self.embeddings_configurations = embeddings_configurations
+        self.model_name = embeddings_configurations['model_name']  # Embedding model instance (e.g., fastText, RoBERTa)
+        self.model = Model(self.model_name, project_path)
+        self.Tokenizer = TextTokenizer(self.embeddings_configurations['tokenization_method'], self.model_name,
+                                       self.embeddings_configurations['max_length'])
+
+        self.model.load_model()
+        self.model_instance = self.model.model_instance
+
+    def extract_embeddings_from_text(self, text_list):
+
+        doc_entry_list = []
+
+        for text in text_list:
+
+            embeddings = {}
+
+            # Tokenize the input text
+            inputs = self.Tokenizer.tokenize_text(text)
+            debug_print(f'inputs are: {inputs}')
+
+            if self.embeddings_configurations['pytorch_based_model']:
+                # e.g. RoBERTa Model or Llama Model
+                import torch
+                with torch.no_grad():
+                    if 'llama' in self.model_name.lower():
+                        # Handle Llama models which expect input_ids directly
+                        outputs = self.model_instance(input_ids=inputs['input_ids'])
+                    else:
+                        # Handle RoBERTa and other models that accept **inputs
+                        if isinstance(inputs, dict):
+                            # Ensure inputs are on the same device as the model
+                            inputs = {k: v.to(self.model_instance.device) for k, v in inputs.items()}
+                            debug_print(f"Model inputs: {inputs}")
+                            outputs = self.model_instance(**inputs, output_hidden_states=True)
+                        else:
+                            debug_print(f"Input type: {type(inputs)}")
+                            debug_print(f"Input content: {inputs}")
+
+                            # If inputs is a list of strings, convert to token IDs first
+                            if isinstance(inputs, list):
+                                if isinstance(inputs[0], str):
+                                    # Convert tokens to IDs
+                                    token_ids = self.Tokenizer.tokenizer.convert_tokens_to_ids(inputs)
+                                    debug_print(f"Token IDs: {token_ids}")
+                                    inputs = torch.tensor([token_ids], device=self.model_instance.device)
+                                else:
+                                    # If it's already a list of numbers, convert directly
+                                    inputs = torch.tensor([inputs], device=self.model_instance.device)
+                            else:
+                                # If it's already a tensor, just move to device
+                                inputs = inputs.to(self.model_instance.device)
+
+                            debug_print(f"Final tensor shape: {inputs.shape}")
+
+                            # Ensure proper shape
+                            if len(inputs.shape) == 1:
+                                inputs = inputs.unsqueeze(0)  # Add batch dimension
+
+                            # Create attention mask
+                            attention_mask = torch.ones_like(inputs)
+                            debug_print(f"Model inputs - input_ids: {inputs.shape}, attention_mask: {attention_mask.shape}")
+                            outputs = self.model_instance(input_ids=inputs, attention_mask=attention_mask, output_hidden_states=True)
+                            debug_print(f"Model outputs type: {type(outputs)}")
+                            debug_print(f"Model outputs attributes: {dir(outputs)}")
+
+                    # Get word embeddings (last hidden state)
+                    if outputs is None:
+                        raise ValueError("Model returned None output")
+
+                    if hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
+                        word_embeddings = outputs.hidden_states[-1]
+                        debug_print(f"Using hidden_states, shape: {word_embeddings.shape}")
+                    elif hasattr(outputs, 'last_hidden_state'):
+                        word_embeddings = outputs.last_hidden_state
+                        debug_print(f"Using last_hidden_state, shape: {word_embeddings.shape}")
+                    else:
+                        raise ValueError(f"Model output has neither hidden_states nor last_hidden_state. Available attributes: {dir(outputs)}")
+
+                # Extract input_ids and convert them back to tokens
+                if isinstance(inputs, dict):
+                    input_ids = inputs['input_ids'][0].tolist()
+                else:
+                    input_ids = inputs[0].tolist()
+                tokens = self.Tokenizer.tokenizer.convert_ids_to_tokens(input_ids)
+
+                # Now align the tokens and embeddings
+                for token, embedding in zip(tokens, word_embeddings[0]):
+                    embeddings[token] = embedding.tolist()
+
+            else:
+                if self.model_name == 'fastText':
+                    embeddings = []
+                    for token in inputs:
+                        embeddings.append((token, self.model_instance.get_word_vector(token)))
+
+            doc_entry_list.append(embeddings)
+
+        return doc_entry_list, len(inputs)
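A minimal usage sketch, assuming a local fastText model is resolvable by Model.load_model() and that the whitespace tokenizer returns a plain token list (config keys taken from the sample configuration files above; the paths are illustrative):

from pelican_nlp.extraction.extract_embeddings import EmbeddingsExtractor

opts = {
    "model_name": "fastText",            # selects the non-PyTorch branch
    "tokenization_method": "whitespace",
    "max_length": 512,
    "pytorch_based_model": False,
}
extractor = EmbeddingsExtractor(opts, project_path="/path/to/project")
docs, n_tokens = extractor.extract_embeddings_from_text(["hund katze maus"])
# docs[0] is a list of (token, vector) pairs in the fastText branch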
{pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/extraction/extract_logits.py
@@ -22,7 +22,12 @@ class LogitsExtractor:
         chunk_size = self.options['chunk_size']
         overlap_size = self.options['overlap_size']
 
-        input_ids = tokens.to(self.device)
+        # Convert list of token IDs to tensor if needed
+        if isinstance(tokens, list):
+            input_ids = torch.tensor([tokens], device=self.device)
+        else:
+            input_ids = tokens.to(self.device)
+
         chunks = self._split_into_chunks(input_ids, chunk_size, overlap_size)
 
         per_token_data = []
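This guard pairs with the tokenizer change below: tokenizer.encode(...) now returns a plain list of token IDs, so it must be wrapped into a batch-of-one tensor before chunking. A standalone sketch of the same pattern:

import torch

tokens = [101, 2023, 2003, 102]      # plain token IDs, as encode() returns
input_ids = torch.tensor([tokens])   # shape (1, 4): batch dimension added
assert input_ids.shape == (1, len(tokens))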
{pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/extraction/language_model.py
@@ -4,7 +4,7 @@ import os
 import shutil
 
 from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoModelForMaskedLM, AutoModel
 
 class Model:
     def __init__(self, model_name, project_path):
@@ -75,7 +75,6 @@ class Model:
 
             print(f'FastText model loaded successfully from {model_path}')
         elif self.model_name == 'xlm-roberta-base':
-            from transformers import AutoModel
             self.model_instance = AutoModel.from_pretrained(
                 self.model_name,
                 trust_remote_code=trust_remote_code,
{pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/main.py
@@ -25,9 +25,10 @@ from pelican_nlp.utils.setup_functions import subject_instantiator, load_config,
 from pelican_nlp.preprocessing import LPDS
 from pelican_nlp.utils.filename_parser import parse_lpds_filename
 
-from pelican_nlp.config import debug_print
+from pelican_nlp.config import debug_print, RUN_TESTS, run_tests
 
 project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_fluency/config_fluency.yml'
+#project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_discourse/config_discourse.yml'
 
 class Pelican:
 
@@ -40,12 +41,12 @@ class Pelican:
         # If no config path is provided, use the default config from package; used for dev-mode
         if config_path is None:
             package_dir = Path(__file__).parent
-            default_config = package_dir / 'configuration_files' / 'config_fluency.yml'
+            default_config = package_dir / 'sample_configuration_files' / 'config_fluency.yml'
             if default_config.exists():
                 config_path = str(default_config)
                 print(f"Using default configuration file: {config_path}")
             else:
-                sys.exit('Error: Default configuration file not found in package.')
+                sys.exit('Error: Default configuration file not found in sample_configuration_files folder.')
 
         # Verify the provided path is a YAML file
         elif not config_path.endswith(('.yml', '.yaml')):
@@ -72,12 +72,6 @@ class Pelican:
         """Execute the main processing pipeline."""
         self._clear_gpu_memory()
 
-        '''
-        #run unittests in dev_mode; not yet implemented
-        if self.dev_mode:
-            self._run_tests()
-        '''
-
         self._handle_output_directory()
 
         # Check/Create LPDS
@@ -229,4 +224,8 @@ class Pelican:
 
 
 if __name__ == '__main__':
-    Pelican(project_path, dev_mode=True).run()
+    if RUN_TESTS:
+        print("Running tests...")
+        run_tests()
+    else:
+        Pelican(project_path, dev_mode=True).run()
{pelican_nlp-0.3.2 → pelican_nlp-0.3.3}/pelican_nlp/preprocessing/text_tokenizer.py
@@ -24,7 +24,8 @@ class TextTokenizer:
             # Tokenize using the model's tokenizer
             return self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=self.max_sequence_length).to(self.device_used)
         elif method == 'model':
-            return self.tokenizer.encode(text, return_tensors='pt')
+            # For model method, return token IDs directly
+            return self.tokenizer.encode(text, add_special_tokens=True)
         else:
             raise ValueError(f"Unsupported tokenization method: {method}")
 
@@ -34,10 +35,14 @@ class TextTokenizer:
     def get_tokenizer(self):
         if self.tokenization_method == 'model' or self.tokenization_method == 'model_roberta':
             from transformers import AutoTokenizer
+            if not self.model_name:
+                raise ValueError("model_name must be provided for model-based tokenization methods")
             return AutoTokenizer.from_pretrained(
                 self.model_name,
                 trust_remote_code=False,  # Don't execute arbitrary model code
                 use_safetensors=True
             )
-        else:
+        elif self.tokenization_method == 'whitespace':
             return None
+        else:
+            raise ValueError(f"Unsupported tokenization method: {self.tokenization_method}")
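For context, in the Hugging Face transformers API, encode(text, add_special_tokens=True) returns a plain list[int], whereas return_tensors='pt' would return a tensor; the new return type is exactly what LogitsExtractor's list guard above expects. A small sketch (downloads the tokenizer on first use):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
ids = tok.encode("ein kurzer Satz", add_special_tokens=True)
print(type(ids), ids[:3])  # <class 'list'> [0, ...]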
pelican_nlp-0.3.3/pelican_nlp/project_graph/graph_visualization.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""
+Graph Visualization for Pelican-nlp Project
+===========================================
+
+This script creates a visual representation of the Pelican-nlp project structure
+using graphviz.
+"""
+
+from graphviz import Digraph
+
+def create_pelican_graph():
+    # Create a new directed graph
+    dot = Digraph(comment='Pelican-nlp Project Structure')
+    dot.attr(rankdir='TB')
+
+    # Set node styles
+    dot.attr('node', shape='box', style='rounded,filled')
+
+    # Main Components
+    with dot.subgraph(name='cluster_main') as c:
+        c.attr(label='Main Components')
+        c.attr('node', fillcolor='lightblue')
+        c.node('Pelican', 'Pelican\n(Main Controller)')
+        c.node('LPDS', 'LPDS\n(Data Structure)')
+        c.node('Corpus', 'Corpus\n(Document Collection)')
+        c.node('Subject', 'Subject\n(Grouping Unit)')
+        c.node('Document', 'Document\n(Data Container)')
+        c.node('AudioDocument', 'AudioDocument\n(Audio Data)')
+
+    # Core Processing
+    with dot.subgraph(name='cluster_core') as c:
+        c.attr(label='Core Processing')
+        c.attr('node', fillcolor='lightgreen')
+        c.node('Config', 'Configuration\n(config.py)')
+        c.node('CLI', 'Command Line Interface\n(cli.py)')
+        c.node('Main', 'Main Entry Point\n(main.py)')
+
+    # Preprocessing Components
+    with dot.subgraph(name='cluster_preprocessing') as c:
+        c.attr(label='Preprocessing')
+        c.attr('node', fillcolor='lightyellow')
+        c.node('TextTokenizer', 'Text Tokenizer\n(text_tokenizer.py)')
+        c.node('TextNormalizer', 'Text Normalizer\n(text_normalizer.py)')
+        c.node('TextCleaner', 'Text Cleaner\n(text_cleaner.py)')
+        c.node('TextImporter', 'Text Importer\n(text_importer.py)')
+        c.node('SpeakerDiarization', 'Speaker Diarization\n(speaker_diarization.py)')
+        c.node('Pipeline', 'Preprocessing Pipeline\n(pipeline.py)')
+
+    # Extraction Components
+    with dot.subgraph(name='cluster_extraction') as c:
+        c.attr(label='Feature Extraction')
+        c.attr('node', fillcolor='lightpink')
+        c.node('LogitsExtractor', 'Logits Extractor\n(extract_logits.py)')
+        c.node('EmbeddingsExtractor', 'Embeddings Extractor\n(extract_embeddings.py)')
+        c.node('LanguageModel', 'Language Model\n(language_model.py)')
+        c.node('AcousticFeatures', 'Acoustic Features\n(acoustic_feature_extraction.py)')
+        c.node('SemanticSimilarity', 'Semantic Similarity\n(semantic_similarity.py)')
+        c.node('RandomnessDistance', 'Distance from Randomness\n(distance_from_randomness.py)')
+
+    # Utility Components
+    with dot.subgraph(name='cluster_utils') as c:
+        c.attr(label='Utilities')
+        c.attr('node', fillcolor='lightgrey')
+        c.node('FilenameParser', 'Filename Parser\n(filename_parser.py)')
+        c.node('CSVFunctions', 'CSV Functions\n(csv_functions.py)')
+        c.node('SetupFunctions', 'Setup Functions\n(setup_functions.py)')
+
+    # Main Relationships
+    dot.edge('Pelican', 'LPDS', 'manages')
+    dot.edge('Pelican', 'Corpus', 'processes')
+    dot.edge('Pelican', 'Subject', 'instantiates')
+    dot.edge('Corpus', 'Document', 'contains')
+    dot.edge('Subject', 'Document', 'groups')
+    dot.edge('Document', 'AudioDocument', 'extends')
+
+    # Core Processing Relationships
+    dot.edge('CLI', 'Main', 'calls')
+    dot.edge('Main', 'Pelican', 'instantiates')
+    dot.edge('Pelican', 'Config', 'uses')
+
+    # Preprocessing Relationships
+    dot.edge('Pipeline', 'TextTokenizer', 'uses')
+    dot.edge('Pipeline', 'TextNormalizer', 'uses')
+    dot.edge('Pipeline', 'TextCleaner', 'uses')
+    dot.edge('Pipeline', 'TextImporter', 'uses')
+    dot.edge('Pipeline', 'SpeakerDiarization', 'uses')
+    dot.edge('Corpus', 'Pipeline', 'executes')
+
+    # Extraction Relationships
+    dot.edge('Corpus', 'LogitsExtractor', 'uses')
+    dot.edge('Corpus', 'EmbeddingsExtractor', 'uses')
+    dot.edge('LogitsExtractor', 'LanguageModel', 'uses')
+    dot.edge('EmbeddingsExtractor', 'LanguageModel', 'uses')
+    dot.edge('Corpus', 'AcousticFeatures', 'uses')
+    dot.edge('Corpus', 'SemanticSimilarity', 'uses')
+    dot.edge('Corpus', 'RandomnessDistance', 'uses')
+
+    # Utility Relationships
+    dot.edge('Pelican', 'FilenameParser', 'uses')
+    dot.edge('Corpus', 'CSVFunctions', 'uses')
+    dot.edge('Pelican', 'SetupFunctions', 'uses')
+
+    # Save the graph
+    dot.render('pelican_structure_detailed', format='png', cleanup=True)
+    print("Detailed graph visualization has been created as 'pelican_structure_detailed.png'")
+
+if __name__ == '__main__':
+    create_pelican_graph()
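Rendering requires both the graphviz Python package and the system Graphviz 'dot' binary on PATH; render(..., format='png', cleanup=True) writes pelican_structure_detailed.png and removes the intermediate DOT source. A minimal sketch, assuming the module is importable from the installed package:

# Requires: pip install graphviz, plus the Graphviz 'dot' binary on PATH.
from pelican_nlp.project_graph.graph_visualization import create_pelican_graph

create_pelican_graph()  # writes pelican_structure_detailed.png to the current directory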