pelican-nlp 0.3.2__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. {pelican_nlp-0.3.2/pelican_nlp.egg-info → pelican_nlp-0.3.4}/PKG-INFO +1 -1
  2. {pelican_nlp-0.3.2/examples/PyPI_testing_discourse → pelican_nlp-0.3.4/examples/example_discourse}/config_discourse.yml +13 -9
  3. pelican_nlp-0.3.2/examples/PyPI_testing_discourse/subjects/sub-01/interview/sub-01_interview_schizophrenia_run-01.rtf → pelican_nlp-0.3.4/examples/example_discourse/subjects/sub-01/interview/sub-01_task-interview_acq-schizophrenia_run-01_transcript.rtf +20 -20
  4. {pelican_nlp-0.3.2/pelican_nlp/configuration_files → pelican_nlp-0.3.4/examples/example_fluency}/config_fluency.yml +2 -3
  5. pelican_nlp-0.3.4/examples/example_general/config_general.yml +146 -0
  6. pelican_nlp-0.3.4/pelican_nlp/_version.py +1 -0
  7. pelican_nlp-0.3.4/pelican_nlp/cli.py +36 -0
  8. pelican_nlp-0.3.4/pelican_nlp/config.py +35 -0
  9. pelican_nlp-0.3.4/pelican_nlp/extraction/extract_embeddings.py +106 -0
  10. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/extraction/extract_logits.py +6 -1
  11. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/extraction/language_model.py +1 -2
  12. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/main.py +9 -10
  13. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/preprocessing/text_tokenizer.py +7 -2
  14. pelican_nlp-0.3.4/pelican_nlp/project_graph/graph_visualization.py +109 -0
  15. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/sample_configuration_files/config_discourse.yml +14 -7
  16. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/sample_configuration_files/config_fluency.yml +2 -3
  17. pelican_nlp-0.3.4/pelican_nlp/sample_configuration_files/config_general.yml +146 -0
  18. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/utils/setup_functions.py +1 -1
  19. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_discourse/config_discourse.yml +109 -0
  20. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_discourse/subjects/sub-01/interview/sub-01_task-interview_acq-schizophrenia_run-01_transcript.rtf +40 -0
  21. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_fluency/config_fluency.yml +106 -0
  22. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_fluency/subjects/sub-01/fluency/sub-01_task-fluency_cat-semantic_acq-animals_text.txt +1 -0
  23. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_fluency/subjects/sub-01/fluency/sub-01_task-fluency_cat-semantic_acq-clothes_text.txt +1 -0
  24. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_fluency/subjects/sub-01/fluency/sub-01_task-fluency_cat-semantic_acq-food_text.txt +1 -0
  25. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_fluency/subjects/sub-02/fluency/sub-02_task-fluency_cat-semantic_acq-animals_text.txt +1 -0
  26. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_fluency/subjects/sub-02/fluency/sub-02_task-fluency_cat-semantic_acq-clothes_text.txt +1 -0
  27. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_fluency/subjects/sub-02/fluency/sub-02_task-fluency_cat-semantic_acq-food_text.txt +1 -0
  28. pelican_nlp-0.3.2/pelican_nlp/sample_configuration_files/config_general.yml → pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_image-descriptions/config_image-descriptions.yml +25 -20
  29. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_task-imgdesc_acq-drug_transcript.docx +0 -0
  30. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_task-imgdesc_acq-placebo_transcript.docx +0 -0
  31. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_task-imgdesc_acq-drug_transcript.docx +0 -0
  32. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_task-imgdesc_acq-placebo_transcript.docx +0 -0
  33. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_task-imgdesc_acq-drug_transcript.docx +0 -0
  34. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/examples/example_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_task-imgdesc_acq-placebo_transcript.docx +0 -0
  35. pelican_nlp-0.3.4/pelican_nlp/utils/unittests/test_examples.py +211 -0
  36. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4/pelican_nlp.egg-info}/PKG-INFO +1 -1
  37. pelican_nlp-0.3.4/pelican_nlp.egg-info/SOURCES.txt +121 -0
  38. pelican_nlp-0.3.2/pelican_nlp/_version.py +0 -1
  39. pelican_nlp-0.3.2/pelican_nlp/cli.py +0 -18
  40. pelican_nlp-0.3.2/pelican_nlp/config.py +0 -14
  41. pelican_nlp-0.3.2/pelican_nlp/extraction/extract_embeddings.py +0 -59
  42. pelican_nlp-0.3.2/pelican_nlp.egg-info/SOURCES.txt +0 -102
  43. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/LICENSE +0 -0
  44. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/MANIFEST.in +0 -0
  45. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/README.rst +0 -0
  46. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/docs/images/pelican_logo.png +0 -0
  47. /pelican_nlp-0.3.2/examples/PyPI_testing_fluency/subjects/sub-01/fluency/sub-01_fluency_sem_animals.txt → /pelican_nlp-0.3.4/examples/example_fluency/subjects/sub-01/fluency/sub-01_task-fluency_cat-semantic_acq-animals_text.txt +0 -0
  48. /pelican_nlp-0.3.2/examples/PyPI_testing_fluency/subjects/sub-01/fluency/sub-01_fluency_sem_clothes.txt → /pelican_nlp-0.3.4/examples/example_fluency/subjects/sub-01/fluency/sub-01_task-fluency_cat-semantic_acq-clothes_text.txt +0 -0
  49. /pelican_nlp-0.3.2/examples/PyPI_testing_fluency/subjects/sub-01/fluency/sub-01_fluency_sem_food.txt → /pelican_nlp-0.3.4/examples/example_fluency/subjects/sub-01/fluency/sub-01_task-fluency_cat-semantic_acq-food_text.txt +0 -0
  50. /pelican_nlp-0.3.2/examples/PyPI_testing_fluency/subjects/sub-02/fluency/sub-02_fluency_sem_animals.txt → /pelican_nlp-0.3.4/examples/example_fluency/subjects/sub-02/fluency/sub-02_task-fluency_cat-semantic_acq-animals_text.txt +0 -0
  51. /pelican_nlp-0.3.2/examples/PyPI_testing_fluency/subjects/sub-02/fluency/sub-02_fluency_sem_clothes.txt → /pelican_nlp-0.3.4/examples/example_fluency/subjects/sub-02/fluency/sub-02_task-fluency_cat-semantic_acq-clothes_text.txt +0 -0
  52. /pelican_nlp-0.3.2/examples/PyPI_testing_fluency/subjects/sub-02/fluency/sub-02_fluency_sem_food.txt → /pelican_nlp-0.3.4/examples/example_fluency/subjects/sub-02/fluency/sub-02_task-fluency_cat-semantic_acq-food_text.txt +0 -0
  53. {pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions → pelican_nlp-0.3.4/examples/example_image-descriptions}/config_image-descriptions.yml +0 -0
  54. /pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_image-description_drug.docx → /pelican_nlp-0.3.4/examples/example_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_task-imgdesc_acq-drug_transcript.docx +0 -0
  55. /pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_image-description_placebo.docx → /pelican_nlp-0.3.4/examples/example_image-descriptions/subjects/sub-01/ses-01/image-description/sub-01_ses-01_task-imgdesc_acq-placebo_transcript.docx +0 -0
  56. /pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_image-description_drug.docx → /pelican_nlp-0.3.4/examples/example_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_task-imgdesc_acq-drug_transcript.docx +0 -0
  57. /pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_image-description_placebo.docx → /pelican_nlp-0.3.4/examples/example_image-descriptions/subjects/sub-01/ses-02/image-description/sub-01_ses-02_task-imgdesc_acq-placebo_transcript.docx +0 -0
  58. /pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_image-description_drug.docx → /pelican_nlp-0.3.4/examples/example_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_task-imgdesc_acq-drug_transcript.docx +0 -0
  59. /pelican_nlp-0.3.2/examples/PyPI_testing_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_image-description_placebo.docx → /pelican_nlp-0.3.4/examples/example_image-descriptions/subjects/sub-02/ses-01/image-description/sub-02_ses-01_task-imgdesc_acq-placebo_transcript.docx +0 -0
  60. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/__init__.py +0 -0
  61. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/extract_acoustic_features.py +0 -0
  62. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/fluency/__init__.py +0 -0
  63. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/fluency/aggregate_fluency_results.py +0 -0
  64. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/fluency/behavioral_data.py +0 -0
  65. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/fluency/check_duplicates.py +0 -0
  66. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/fluency/coherence.py +0 -0
  67. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/fluency/config.py +0 -0
  68. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/fluency/main.py +0 -0
  69. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/fluency/optimality_without_tsa.py +0 -0
  70. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/fluency/plot_fluency.py +0 -0
  71. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/fluency/plotting_utils.py +0 -0
  72. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/fluency/questionnaires_data.py +0 -0
  73. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/fluency/stats_fluency.py +0 -0
  74. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/fluency/utils.py +0 -0
  75. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/speaker_diarization_Nils.py +0 -0
  76. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/transcription/__init__.py +0 -0
  77. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/transcription/annotation_tool.py +0 -0
  78. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/transcription/annotation_tool_boundaries.py +0 -0
  79. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/transcription/annotation_tool_sandbox.py +0 -0
  80. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/transcription/output/holmes_control_nova_all_outputs.json +0 -0
  81. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/transcription/test.json +0 -0
  82. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/transcription/transcribe_audio.py +0 -0
  83. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/transcription/transcribe_audio_chunked.py +0 -0
  84. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/transcription/transcription.py +0 -0
  85. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/transcription/transcription_gui.py +0 -0
  86. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Nils_backup/transcription/word_boundaries.py +0 -0
  87. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Silvia_files/Opensmile/opensmile_feature_extraction.py +0 -0
  88. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/Silvia_files/prosogram/prosogram.py +0 -0
  89. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/__init__.py +0 -0
  90. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/configuration_files/config_audio.yml +0 -0
  91. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/configuration_files/config_discourse.yml +0 -0
  92. {pelican_nlp-0.3.2/examples/PyPI_testing_fluency → pelican_nlp-0.3.4/pelican_nlp/configuration_files}/config_fluency.yml +0 -0
  93. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/configuration_files/config_general.yml +0 -0
  94. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/configuration_files/config_morteza.yml +0 -0
  95. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/core/__init__.py +0 -0
  96. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/core/audio_document.py +0 -0
  97. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/core/corpus.py +0 -0
  98. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/core/document.py +0 -0
  99. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/core/subject.py +0 -0
  100. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/extraction/__init__.py +0 -0
  101. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/extraction/acoustic_feature_extraction.py +0 -0
  102. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/extraction/distance_from_randomness.py +0 -0
  103. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/extraction/semantic_similarity.py +0 -0
  104. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/extraction/test_documents/test_features.csv +0 -0
  105. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/extraction/test_documents/wallace_1.15_3.txt +0 -0
  106. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/extraction/test_documents/wallace_1.1_3.txt +0 -0
  107. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/extraction/test_documents/wallace_1_4.txt +0 -0
  108. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/metrics_statistics/embeddings_metrics_statistics.py +0 -0
  109. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/praat/__init__.py +0 -0
  110. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/preprocessing/LPDS.py +0 -0
  111. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/preprocessing/__init__.py +0 -0
  112. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/preprocessing/pipeline.py +0 -0
  113. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/preprocessing/speaker_diarization.py +0 -0
  114. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/preprocessing/text_cleaner.py +0 -0
  115. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/preprocessing/text_importer.py +0 -0
  116. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/preprocessing/text_normalizer.py +0 -0
  117. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/utils/__init__.py +0 -0
  118. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/utils/csv_functions.py +0 -0
  119. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/utils/filename_parser.py +0 -0
  120. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/utils/sample_usage.py +0 -0
  121. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp.egg-info/dependency_links.txt +0 -0
  122. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp.egg-info/entry_points.txt +0 -0
  123. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp.egg-info/requires.txt +0 -0
  124. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp.egg-info/top_level.txt +0 -0
  125. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pyproject.toml +0 -0
  126. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/requirements.txt +0 -0
  127. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/setup.cfg +0 -0
  128. {pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/tests/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pelican_nlp
3
- Version: 0.3.2
3
+ Version: 0.3.4
4
4
  Summary: Preprocessing and Extraction of Linguistic Information for Computational Analysis
5
5
  Author-email: Yves Pauli <yves.pauli@gmail.com>
6
6
  License-Expression: CC-BY-NC-4.0
@@ -7,11 +7,15 @@ discourse: &discourse_flag true
7
7
  #general configurations; always adapt
8
8
  language: "german" # Possibly add options for German and English
9
9
 
10
- task_name: "interview" # Give name of task used for creation of the input file (e.g., ['fluency', 'interview'])
11
- corpus_names:
10
+ task_name: "interview"
11
+
12
+ #Create analysis corpus, group files based on corpus entity.
13
+ corpus_key: "acq"
14
+ corpus_values: #group names
15
+ - "placebo"
12
16
  - "schizophrenia"
13
17
 
14
- metric_to_extract: "embeddings" #Possible options: 'logits' or 'embeddings'
18
+ metric_to_extract: "logits" #Possible options: 'logits' or 'embeddings'
15
19
 
16
20
  number_of_speakers: 3
17
21
  subject_speakertag: "B"
@@ -43,7 +47,7 @@ options_logits:
43
47
  keep_speakertags: true
44
48
 
45
49
  options_embeddings:
46
- tokenization_method: "model_roberta" #or "whitespace", "model"
50
+ tokenization_method: "model" #"model" or "whitespace"
47
51
  max_length: 512 #max sequence length
48
52
  model_name: "xlm-roberta-base" #e.g. "fastText", "xlm-roberta-base"
49
53
  pytorch_based_model: true
@@ -59,10 +63,10 @@ options_embeddings:
59
63
  remove_punctuation_and_symbols: true
60
64
  remove_brackets_and_content: true
61
65
  semantic-similarity: false
66
+ distance-from-randomness: false
62
67
  window_size: null
63
68
  clean_tokens: false
64
-
65
- distance-from-randomness: false
69
+ divergence_from_optimality: false
66
70
  #================================================================================
67
71
 
68
72
  #Extra configurations:
@@ -93,13 +97,13 @@ normalization_options:
93
97
  method: "lemmatization" #Options: lemmatization or stemming
94
98
  #================================================================
95
99
 
100
+ create_aggregation_of_results: false
101
+ output_document_information: false
102
+
96
103
  #Detail configurations; Changes optional, mostly used for quality checking / error handling
97
104
  number_of_subjects: null # Specify number of subjects; if 'null', number of subjects is automatically detected
98
105
  multiple_sessions: false # Set to True if multiple sessions per subject
99
106
 
100
107
  recompute_everything: true #If set to 'false' pelican-nlp will try to reuse previously computed results stored on your drive
101
108
 
102
- create_aggregation_of_results: false
103
- output_document_information: false
104
-
105
109
 
@@ -1,40 +1,40 @@
1
1
  {\rtf1\ansi\deff3\adeflang1025
2
- {\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\froman\fprq2\fcharset0 Arial;}{\f5\froman\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f6\fnil\fprq2\fcharset0 Noto Sans CJK SC;}{\f7\fnil\fprq2\fcharset0 0;}{\f8\fnil\fprq2\fcharset0 Noto Sans Devanagari;}}
2
+ {\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\froman\fprq2\fcharset0 Arial;}{\f5\froman\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f6\fnil\fprq2\fcharset0 0;}{\f7\fnil\fprq2\fcharset0 Noto Sans CJK SC;}{\f8\fnil\fprq2\fcharset0 Noto Sans Devanagari;}}
3
3
  {\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
4
- {\stylesheet{\s0\snext0\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052 Normal;}
4
+ {\stylesheet{\s0\snext0\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052 Normal;}
5
5
  {\*\cs15\snext15 Footnote Characters;}
6
6
  {\*\cs16\snext16\rtlch\ab \ltrch\loch\b Strong;}
7
- {\s17\sbasedon0\snext18\rtlch\af8\afs28\alang1081 \ltrch\lang1033\langfe2052\hich\af5\loch\ql\nowidctlpar\hyphpar1\sb240\sa120\keepn\ltrpar\cf0\f5\fs28\lang1033\kerning1\dbch\af6\langfe2052 Heading;}
8
- {\s18\sbasedon0\snext18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052 Body Text;}
9
- {\s19\sbasedon18\snext19\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052 List;}
10
- {\s20\sbasedon0\snext20\rtlch\af8\afs24\alang1081\ai \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\sb120\sa120\ltrpar\cf0\f3\fs24\lang1033\i\kerning1\dbch\af7\langfe2052 Caption;}
11
- {\s21\sbasedon0\snext21\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052 Index;}
7
+ {\s17\sbasedon0\snext18\rtlch\af8\afs28\alang1081 \ltrch\lang1033\langfe2052\hich\af5\loch\ql\nowidctlpar\hyphpar1\sb240\sa120\keepn\ltrpar\cf0\f5\fs28\lang1033\kerning1\dbch\af7\langfe2052 Heading;}
8
+ {\s18\sbasedon0\snext18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052 Body Text;}
9
+ {\s19\sbasedon18\snext19\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052 List;}
10
+ {\s20\sbasedon0\snext20\rtlch\af8\afs24\alang1081\ai \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\sb120\sa120\ltrpar\cf0\f3\fs24\lang1033\i\kerning1\dbch\af6\langfe2052 Caption;}
11
+ {\s21\sbasedon0\snext21\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052 Index;}
12
12
  }{\*\generator LibreOffice/24.2.7.2$Linux_X86_64 LibreOffice_project/420$Build-2}{\info{\title 648866ebdbd870441d179a92}{\author Thomas Luthi-Bhatti}{\creatim\yr2023\mo6\dy14\hr17\min6}{\revtim\yr2025\mo4\dy8\hr13\min51}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops{\propname Operator}\proptype30{\staticval Ulrike Rachner}}\deftab720
13
13
  \hyphauto1\viewscale100\formshade\paperh16838\paperw11906\margl1417\margr1417\margt1417\margb1398\sectd\sbknone\sftnnar\saftnnrlc\sectunlocked1\pgwsxn11906\pghsxn16838\marglsxn1417\margrsxn1417\margtsxn1417\margbsxn1398\ftnbj\ftnstart1\ftnrestart\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
14
- {\*\ftnsep\chftnsep}\pgndec\pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052\sl100\slmult0\qc\hyphpar0\fi0\li0\lin0\ri0\rin0\sb238\sa0{\hich\af4\loch\cs16\rtlch\ab \ltrch\loch\b\fs22\lang1031\f4\loch
14
+ {\*\ftnsep\chftnsep}\pgndec\pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl100\slmult0\qc\hyphpar0\fi0\li0\lin0\ri0\rin0\sb238\sa0{\hich\af4\loch\cs16\rtlch\ab \ltrch\loch\b\fs22\lang1031\f4\loch
15
15
  Interview with Interviewee}
16
- \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052\sl100\slmult0\qc\hyphpar0\fi0\li0\lin0\ri0\rin0\sb238\sa0\loch
16
+ \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl100\slmult0\qc\hyphpar0\fi0\li0\lin0\ri0\rin0\sb238\sa0\loch
17
17
 
18
- \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
18
+ \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
19
19
  I: Das ist f\u252\'fcr mich. Ich m\u246\'f6chte, dass Sie \u252\'fcber ein paar Dinge aus Ihrem t\u228\'e4glichen Leben sprechen. Sie m\u252\'fcssen (keinerlei?) Namen nennen, w\u228\'e4hrend Sie dieses Ereignis beschreiben. K\u246\'f6nnen Sie mir ein wenig \u252\'fcber sich erz\u228\'e4hlen? #00:00:14-00#\line B: (In Schriftsprache.) Ja, nat\u252\'fcrlich. Jeden Morgen beginne ich den Tag mit einer Tasse Tee. Ich bin jemand, der viel Wert auf eine ruhige Morgenroutine legt. Es ist f\u252\'fcr mich sehr wichtig, dass der Start in den Tag entspannt und nicht hektisch ist. Oft lese ich auch ein paar Seiten in einem Buch, das ich gerade lese. Danach gehe ich meistens zur Arbeit, entweder ins B\u252\'fcro oder arbeite von zu Hause aus. Mein Job ist sehr abwechslungsreich, und es gef\u228\'e4llt mir, immer neue Herausforderungen zu haben. Am Nachmittag gehe ich oft spazieren oder treffe mich mit Freunden. Ein gutes Gespr\u228\'e4ch oder eine kleine Wanderung in der Natur tut mir immer sehr gut. Am Abend koche ich gerne etwas Leckeres und entspanne mich beim Fernsehen oder h\u246\'f6re Musik. #00:00:51-00#}
20
- \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
20
+ \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
21
21
  I: Wenn Sie zur\u252\'fcckdenken, k\u246\'f6nnen Sie mir eine Geschichte \u252\'fcber etwas Wichtiges erz\u228\'e4hlen, das in Ihrem Leben passiert ist? Die Geschichte kann aus einer beliebigen Zeit Ihres Lebens stammen, aus Ihrer Kindheit oder auch vor Kurzem. Sie brauchen keine Namen zu nennen, wenn Sie dieses Ereignis beschreiben. #00:04:19-00#\line B: Ich erinnere mich an eine Zeit, als ich mit meiner Familie in einem kleinen Dorf auf einem Berg war. Es war ein Winterwochenende, und wir hatten viel Schnee. An diesem Tag sind wir alle zusammen mit Schlitten den Hang hinuntergefahren. Es war eine sehr lustige Erfahrung, weil wir alle wie Kinder waren, trotz des Alters. Aber was mir wirklich in Erinnerung geblieben ist, war, dass ich mich nach diesem Tag viel n\u228\'e4her mit meiner Familie verbunden f\u252\'fchlte. Es war ein Moment, in dem wir uns alle unterst\u252\'fctzt und gemeinsam gelacht haben, was damals sehr wichtig f\u252\'fcr mich war. Diese Momente mit der Familie sind f\u252\'fcr mich unersetzlich. #00:05:42-00#}
22
- \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
22
+ \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
23
23
  I: Ich m\u246\'f6chte Sie nun bitten, dass Sie ein wenig \u252\'fcber Ihre Gesundheit sprechen. Sie brauchen keine Namen zu nennen. Glauben Sie, dass Sie eine psychische Krankheit haben? Und wenn ja, worum handelt es sich Ihrer Meinung nach? #00:06:03-00#\line B: In letzter Zeit habe ich mich mehr mit meiner mentalen Gesundheit besch\u228\'e4ftigt. Ich w\u252\'fcrde sagen, dass ich in einer stabilen psychischen Verfassung bin, aber es gibt Momente, in denen ich mich \u252\'fcberfordert f\u252\'fchle. Gerade in stressigen Phasen merke ich, dass es schwieriger f\u252\'fcr mich ist, den Kopf klar zu behalten. Aber ich versuche, mir Hilfe zu suchen und achte sehr darauf, auf mich selbst zu h\u246\'f6ren. Es gibt Phasen, in denen ich das Gef\u252\'fchl habe, dass ich eine kurze Auszeit brauche, um mich wieder zu sortieren. Aber insgesamt denke ich, dass ich psychisch gesund bin, solange ich mir genug Zeit f\u252\'fcr mich nehme. #00:06:48-00#}
24
- \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
24
+ \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
25
25
  I: Und wurde Ihnen eine Diagnose gestellt? #00:07:03-00#\line B: Nein, bisher nicht. #00:07:04-00#}
26
- \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
26
+ \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
27
27
  I: Danke. Okay. Ich werde Ihnen jetzt drei Bilder zeigen, und eins nach dem anderen. Jedes Mal, wenn ich das Bild vor Sie lege, m\u246\'f6chte ich Sie bitten, das Bild so vollst\u228\'e4ndig wie m\u246\'f6glich zu beschreiben. Sagen Sie mir, was Sie auf dem Bild sehen und was Ihrer Meinung nach passieren k\u246\'f6nnte. Bitte sprechen Sie, bis ich Stopp sage. (...) Bild Nummer zwei. Bitte sagen Sie, was Sie auf diesem Bild sehen. #00:09:10-00#\line B: (Startet in Schriftsprache.) Auf diesem Bild sieht man eine Gruppe von Personen, die auf einem Markt stehen. Es ist ein lebhafter Ort, mit vielen St\u228\'e4nden und bunten Waren. In der Mitte sieht man eine \u228\'e4ltere Frau, die gerade eine Melone ausw\u228\'e4hlt. Sie tr\u228\'e4gt eine einfache, aber stilvolle Kleidung. Links sieht man einen jungen Mann, der mit einem Verk\u228\'e4ufer spricht, der gerade Tomaten in eine T\u252\'fcte packt. Im Hintergrund sieht man weitere Marktst\u228\'e4nde, die mit Obst und Gem\u252\'fcse voll sind. Der Himmel ist bew\u246\'f6lkt, und es sieht aus, als w\u252\'fcrde es bald regnen. Es scheint ein sch\u246\'f6ner, aber auch sehr besch\u228\'e4ftigter Tag zu sein. #00:10:37-00#}
28
- \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
28
+ \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
29
29
  I: Danke sch\u246\'f6n. Bild Nummer vier. Was passiert auf diesem Bild? Oder was sehen Sie auf diesem Bild? #00:10:46-00#\line B: Auf diesem Bild sieht man einen Mann und eine Frau, die zusammen auf einer Bank sitzen. Der Mann ist in einem Anzug und schaut auf sein Handy. Die Frau tr\u228\'e4gt ein sommerliches Kleid und schaut nachdenklich in die Ferne. Sie scheint in einer anderen Welt zu sein, w\u228\'e4hrend der Mann abgelenkt ist. Im Hintergrund ist ein Park zu sehen, mit B\u228\'e4umen und einem kleinen See. Die Stimmung wirkt ein bisschen melancholisch, als ob beide Menschen in Gedanken versunken sind. Es scheint, als ob sie ein Gespr\u228\'e4ch f\u252\'fchren, aber jeder ist in seiner eigenen Welt. #00:12:00-00#}
30
- \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
30
+ \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
31
31
  I: Danke sch\u246\'f6n. Und Bild Nummer 17GF, was sehen Sie auf diesem Bild? #00:12:09-00#\line B: Auf diesem Bild sieht man einen alten Leuchtturm, der auf einem Felsen \u252\'fcber dem Meer thront. Der Himmel ist dramatisch, mit dunklen Wolken und einer Art Sturmstimmung. Das Meer ist unruhig und st\u252\'fcrmisch, und man sieht die Wellen gegen den Felsen schlagen. In der N\u228\'e4he des Leuchtturms ist ein kleiner, alter Kutter zu sehen, der versucht, gegen die Wellen anzukommen. Es wirkt wie eine dramatische Szene, bei der der Leuchtturm als Rettungsanker in dieser st\u252\'fcrmischen See dient. Der Leuchtturm strahlt ein warmes Licht aus, das den Kutter zu f\u252\'fchren scheint. #00:13:23-00#}
32
- \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
32
+ \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
33
33
  I: Danke. Gut, ich werde Ihnen nun einige Bilder aus einer Geschichte zeigen. Sie k\u246\'f6nnen sich so viel Zeit nehmen, wie Sie brauchen, um die Bilder anzuschauen. Nachdem Sie alle Bilder der Reihe nach angesehen haben, m\u246\'f6chte ich Sie bitten, mir die Geschichten auf den Bildern in Ihren eigenen Worten zu erz\u228\'e4hlen. Das ist die Geschichte. #00:13:47-00#\line B: Ich habe in der Kindheit oft getr\u228\'e4umt, dass ich in einem Wald unterwegs war. #00:15:59-00#}
34
- \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
34
+ \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
35
35
  I: Im Wald? #00:15:56-00#\line B: Ja, genau. Ich war als Kind oft drau\u223\'dfen in den W\u228\'e4ldern, und in meinen Tr\u228\'e4umen bin ich immer tiefer in den Wald gegangen. Eines Tages kam ich an einen kleinen Bach, der durch den Wald floss. Der Bach war klar und das Wasser funkelte im Sonnenlicht. Ich sa\u223\'df dort und beobachtete die Fische, die durch das Wasser schwammen. Es war sehr ruhig, und ich f\u252\'fchlte mich v\u246\'f6llig friedlich. In diesem Moment hatte ich das Gef\u252\'fchl, dass ich ein Teil der Natur war und mit der Welt um mich herum eins. Es war ein sch\u246\'f6ner, friedlicher Traum, der mir auch als Erwachsener oft in den Sinn kommt. #00:16:44-00#}
36
- \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
36
+ \par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
37
37
  I: Gut. Jetzt haben Sie eine Minute Zeit, um den Text durchzugehen, danach bitte ich Sie, das Blatt wegzulegen und mir die Geschichte in eigenen Worten zu erz\u228\'e4hlen. #00:17:51-00#\line B: Okay. #00:17:52-00#\line (Stille. B liest. #00:17:52-00# - #00:19:13-00#)\line B: (in Schriftsprache.) Die Geschichte handelt von einer kleinen Katze, die an einem sehr hei\u223\'dfen Tag im Schatten eines Baumes schl\u228\'e4ft. Sie tr\u228\'e4umt von einem k\u252\'fchlen Teich, an dem sie trinken kann. Als sie aufwacht, ist der Teich nicht mehr weit, und die Katze folgt einem Schmetterling, der sie zu einem geheimen, versteckten Ort f\u252\'fchrt. Der Teich ist klar, und die Katze kann endlich ihren Durst l\u246\'f6schen. Sie ist sehr zufrieden und kehrt sp\u228\'e4ter zur\u252\'fcck zu ihrem Baum, um sich wieder auszuruhen. #00:20:09-00#\line I: Danke sch\u246\'f6n.}
38
- \par \pard\plain \s0\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052\sl100\slmult0\qc\hyphpar0\fi0\li0\lin0\ri0\rin0\sb238\sa0\loch
38
+ \par \pard\plain \s0\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl100\slmult0\qc\hyphpar0\fi0\li0\lin0\ri0\rin0\sb238\sa0\loch
39
39
 
40
40
  \par }
@@ -8,7 +8,8 @@ fluency_task: &fluency_flag true
8
8
  language: "german"
9
9
  multiple_sessions: &session_flag false
10
10
 
11
- corpus_names: #names of fluency tasks (e.g. "animals", "clothes")
11
+ corpus_key: "acq"
12
+ corpus_values: #names of fluency tasks (e.g. "animals", "clothes")
12
13
  - "animals"
13
14
  - "clothes"
14
15
  - "food"
@@ -103,5 +104,3 @@ filename_components:
103
104
  metric: true
104
105
  additional_tags: []
105
106
 
106
-
107
-
@@ -0,0 +1,146 @@
1
+ # Master Configuration File
2
+ # ========================
3
+
4
+ # Basic Settings
5
+ # -------------
6
+ input_file: "text" # Options: 'text' or 'audio'
7
+ language: "german"
8
+ recompute_everything: true # If false, a warning is given when the output folder already exists
9
+
10
+ # Task Configuration
11
+ # -----------------
12
+ task_name: null # Name of task used for creation of data
13
+ fluency_task: &fluency_flag false # Flag for fluency-specific settings
14
+ discourse: &discourse_flag false # Flag for discourse-specific settings
15
+
16
+ # Corpus Configuration
17
+ # ------------------
18
+ corpus_key: null # Entity key to group files for analysis
19
+ corpus_values: # Corresponding entity values found in dataset
20
+ - "healthy-control"
21
+ - "placebo"
22
+
23
+ # Session and Subject Settings
24
+ # --------------------------
25
+ multiple_sessions: false
26
+ number_of_subjects: null # If null, auto-detected
27
+ number_of_speakers: 1 # Specify the number of speakers in discourse files
28
+ subject_speakertag: null # Speaker tag for subject (e.g., "B"), only for discourse
29
+
30
+ # Document Structure
31
+ # ----------------
32
+ has_multiple_sections: false
33
+ has_section_titles: false
34
+ section_identification: null # e.g., "Section:", in case of multiple sections
35
+ number_of_sections: null # If null, auto-detected, specify for multiple sections to check section detection
36
+
37
+ # Processing Pipeline
38
+ # -----------------
39
+ pipeline_options: # Just for data preprocessing without metric extraction
40
+ quality_check: false
41
+ clean_text: true
42
+ tokenize_text: false
43
+ normalize_text: false
44
+
45
+ # Metric Extraction
46
+ # ---------------
47
+ metric_to_extract: "embeddings" # Options: 'embeddings', 'logits'
48
+ output_document_information: true
49
+
50
+ # Cleaning Options
51
+ # --------------
52
+ cleaning_options:
53
+ general_cleaning: true # General cleaning applied to most datasets, check specifications in section "general_cleaning_options"
54
+ remove_punctuation: false
55
+ lowercase: true
56
+ remove_brackets_and_bracketcontent: false
57
+ remove_timestamps: false
58
+ timestamp_pattern_example: null # e.g., "#00:00:23-00#", only if remove_timestamps = True
59
+ # Fluency-specific options
60
+ fluency_task: *fluency_flag
61
+ word_splitter: ';'
62
+ remove_hyphens: true
63
+ remove_duplicates: true
64
+
65
+ general_cleaning_options:
66
+ strip_whitespace: true
67
+ merge_multiple_whitespaces: true
68
+ remove_whitespace_before_punctuation: true
69
+ merge_newline_characters: true
70
+ remove_backslashes: true
71
+
72
+ # Embedding Options
73
+ # ---------------
74
+ options_embeddings:
75
+ tokenization_method: "whitespace" # Options: 'whitespace', 'model'
76
+ model_name: "fastText" # Options: 'fastText', 'xlm-roberta-base'
77
+ pytorch_based_model: false
78
+ method: "model_instance"
79
+ max_length: 512
80
+ clean_embedding_tokens: true
81
+ remove_punctuation: false
82
+ lowercase: false
83
+ keep_speakertags: false
84
+ semantic-similarity: true
85
+ window_size: null
86
+ clean_tokens: true
87
+ divergence_from_optimality: false
88
+ output_options:
89
+ exclude_special_tokens: true
90
+ remove_'_'_character: true
91
+ remove_speaker_labels: true
92
+ remove_punctuation_and_symbols: true
93
+ remove_brackets_and_content: true
94
+
95
+ # Logits Options
96
+ # -------------
97
+ options_logits:
98
+ chunk_size: 128
99
+ overlap_size: 64
100
+ tokenization_method: "model"
101
+ model_name: "DiscoResearch/Llama3-German-8B-32k"
102
+ remove_punctuation: true
103
+ lowercase: true
104
+ keep_speakertags: true
105
+
106
+ # Analysis Options
107
+ # --------------
108
+ options_semantic-similarity:
109
+ window_sizes: # 'all' or window size as integer
110
+ - 2
111
+ - 8
112
+
113
+ options_dis_from_randomness:
114
+ window_size: 8
115
+ min_len: null
116
+ bootstrap: 10000
117
+ shuffle_mode: 'include0_includeN'
118
+ parallel_computing: false
119
+
120
+ # Normalization Options
121
+ # -------------------
122
+ normalization_options:
123
+ method: "lemmatization" # Options: 'lemmatization', 'stemming'
124
+
125
+ # Document Information Output
126
+ # -------------------------
127
+ document_information_output:
128
+ parameters:
129
+ - subject_ID
130
+ - fluency_word_count
131
+ - fluency_duplicate_count
132
+
133
+ # Filename Configuration
134
+ # --------------------
135
+ filename_components:
136
+ subject: true # mandatory
137
+ session: false
138
+ task: true # mandatory
139
+ task_addition: false
140
+ corpus: true # mandatory
141
+ metric: true
142
+ additional_tags: []
143
+
144
+ # Additional Settings
145
+ # -----------------
146
+ create_aggregation_of_results: true
@@ -0,0 +1 @@
1
+ __version__ = "0.3.4"
@@ -0,0 +1,36 @@
1
+ import os
2
+ from pathlib import Path
3
+ from pelican_nlp.main import Pelican
4
+ from pelican_nlp.config import RUN_TESTS, run_tests
5
+
6
def main():
    """CLI entry point: find exactly one YAML config in the CWD and run Pelican."""
    # Test mode bypasses the normal pipeline entirely.
    if RUN_TESTS:
        print("Running tests...")
        run_tests()
        return

    # Configurations are discovered in the user's current working directory.
    config_dir = Path.cwd()

    print(f"Looking for configuration files in: {config_dir}")

    candidates = [name for name in os.listdir(config_dir) if name.endswith((".yml", ".yaml"))]

    # Guard: nothing to run without a configuration file.
    if not candidates:
        print("No .yml or .yaml configuration file found in the current directory.")
        print("Please ensure you have a configuration file in your current working directory.")
        return

    # Guard: refuse to guess between several configuration files.
    if len(candidates) > 1:
        print("Warning: Multiple configuration files found in current directory:")
        for i, name in enumerate(candidates, 1):
            print(f"  {i}. {name}")
        print("Please ensure only one configuration file is present in the current directory.")
        return

    config_file = str(config_dir / candidates[0])
    print(f"Using configuration file: {config_file}")

    Pelican(config_file).run()
@@ -0,0 +1,35 @@
1
+ """
2
+ Global configuration settings for the Pelican project.
3
+
4
+ This file is not the configuration.yml file created for the users adaptations.
5
+ For consistency of pipeline, DO NOT CHANGE.
6
+ """
7
+
8
+ # Debug flag
9
+ DEBUG_MODE = True
10
+
11
+ # Test flag - set to True to run all example tests
12
+ RUN_TESTS = False
13
+
14
def debug_print(*args, **kwargs):
    """Print only if the module-level DEBUG_MODE flag is enabled.

    Accepts exactly the same positional and keyword arguments as the
    built-in print().
    """
    # Bug fix: a previous local assignment `DEBUG_MODE = True` shadowed the
    # module-level flag, so this function printed unconditionally and the
    # global DEBUG_MODE switch had no effect. Read the module-level flag.
    if DEBUG_MODE:
        print(*args, **kwargs)
19
+
20
def run_tests():
    """Discover and run the example test suite when RUN_TESTS is enabled."""
    # No-op unless the module-level test flag is set.
    if not RUN_TESTS:
        return

    import unittest
    from pathlib import Path

    # The example tests live in pelican_nlp/utils/unittests/test_examples.py;
    # discovery is rooted at that directory.
    test_file = Path(__file__).parent / "utils" / "unittests" / "test_examples.py"
    suite = unittest.TestLoader().discover(str(test_file.parent), pattern="test_examples.py")
    unittest.TextTestRunner(verbosity=2).run(suite)
@@ -0,0 +1,106 @@
1
+ from pelican_nlp.extraction.language_model import Model
2
+ from pelican_nlp.preprocessing.text_tokenizer import TextTokenizer
3
+
4
+ from pelican_nlp.config import debug_print
5
+
6
class EmbeddingsExtractor:
    """Extract per-token embeddings from documents with a configured model.

    Two backend families are handled:
      * PyTorch transformer models (e.g. RoBERTa, Llama) — contextual
        embeddings taken from the model's hidden states.
      * fastText — static word vectors looked up per token.
    """

    def __init__(self, embeddings_configurations, project_path):
        """Set up the tokenizer and load the embedding model.

        :param embeddings_configurations: dict of embedding options from the
            user configuration (expects at least 'model_name',
            'tokenization_method', 'max_length', 'pytorch_based_model').
        :param project_path: project path forwarded to Model for locating
            model files.
        """
        self.embeddings_configurations = embeddings_configurations
        # Embedding model identifier (e.g., 'fastText', 'xlm-roberta-base')
        self.model_name = embeddings_configurations['model_name']
        self.model = Model(self.model_name, project_path)
        self.Tokenizer = TextTokenizer(self.embeddings_configurations['tokenization_method'], self.model_name,
                                       self.embeddings_configurations['max_length'])

        self.model.load_model()
        self.model_instance = self.model.model_instance

    def extract_embeddings_from_text(self, text_list):
        """Compute embeddings for every document in text_list.

        :param text_list: list of raw text strings, one per document.
        :return: tuple ``(doc_entry_list, n_tokens)``. ``doc_entry_list``
            holds one entry per document: a dict mapping token -> embedding
            list for PyTorch models, or a list of (token, vector) pairs for
            fastText. ``n_tokens`` is the token count of the LAST processed
            document (0 for an empty ``text_list``).
        """
        doc_entry_list = []
        # Bug fix: bind `inputs` before the loop so `len(inputs)` in the
        # return statement cannot raise NameError when text_list is empty.
        inputs = []

        for text in text_list:

            embeddings = {}

            # Tokenize the input text
            inputs = self.Tokenizer.tokenize_text(text)
            debug_print(f'inputs are: {inputs}')

            if self.embeddings_configurations['pytorch_based_model']:
                # e.g. RoBERTa Model or Llama Model
                import torch
                with torch.no_grad():
                    if 'llama' in self.model_name.lower():
                        # Llama models expect input_ids directly.
                        # NOTE(review): assumes the tokenizer returned a dict
                        # with 'input_ids' here — confirm for the 'model'
                        # tokenization method, which returns a plain list.
                        outputs = self.model_instance(input_ids=inputs['input_ids'])
                    else:
                        # RoBERTa and other models accept **inputs
                        if isinstance(inputs, dict):
                            # Ensure inputs are on the same device as the model
                            inputs = {k: v.to(self.model_instance.device) for k, v in inputs.items()}
                            debug_print(f"Model inputs: {inputs}")
                            outputs = self.model_instance(**inputs, output_hidden_states=True)
                        else:
                            debug_print(f"Input type: {type(inputs)}")
                            debug_print(f"Input content: {inputs}")

                            # If inputs is a list of strings, convert to token IDs first
                            if isinstance(inputs, list):
                                if isinstance(inputs[0], str):
                                    # Convert tokens to IDs
                                    token_ids = self.Tokenizer.tokenizer.convert_tokens_to_ids(inputs)
                                    debug_print(f"Token IDs: {token_ids}")
                                    inputs = torch.tensor([token_ids], device=self.model_instance.device)
                                else:
                                    # Already numeric token IDs; convert directly
                                    inputs = torch.tensor([inputs], device=self.model_instance.device)
                            else:
                                # Already a tensor; just move it to the model's device
                                inputs = inputs.to(self.model_instance.device)

                            debug_print(f"Final tensor shape: {inputs.shape}")

                            # Ensure a batch dimension is present
                            if len(inputs.shape) == 1:
                                inputs = inputs.unsqueeze(0)

                            # All positions are real tokens, so attend everywhere
                            attention_mask = torch.ones_like(inputs)
                            debug_print(f"Model inputs - input_ids: {inputs.shape}, attention_mask: {attention_mask.shape}")
                            outputs = self.model_instance(input_ids=inputs, attention_mask=attention_mask, output_hidden_states=True)
                            debug_print(f"Model outputs type: {type(outputs)}")
                            debug_print(f"Model outputs attributes: {dir(outputs)}")

                    # Get word embeddings from the final layer
                    if outputs is None:
                        raise ValueError("Model returned None output")

                    if hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
                        word_embeddings = outputs.hidden_states[-1]
                        debug_print(f"Using hidden_states, shape: {word_embeddings.shape}")
                    elif hasattr(outputs, 'last_hidden_state'):
                        word_embeddings = outputs.last_hidden_state
                        debug_print(f"Using last_hidden_state, shape: {word_embeddings.shape}")
                    else:
                        raise ValueError(f"Model output has neither hidden_states nor last_hidden_state. Available attributes: {dir(outputs)}")

                    # Extract input_ids and convert them back to tokens
                    if isinstance(inputs, dict):
                        input_ids = inputs['input_ids'][0].tolist()
                    else:
                        input_ids = inputs[0].tolist()
                    tokens = self.Tokenizer.tokenizer.convert_ids_to_tokens(input_ids)

                    # Align tokens with their embeddings.
                    # NOTE(review): duplicate tokens overwrite earlier entries
                    # in this dict — confirm that is acceptable downstream.
                    for token, embedding in zip(tokens, word_embeddings[0]):
                        embeddings[token] = embedding.tolist()

            else:
                if self.model_name == 'fastText':
                    # NOTE(review): fastText produces a list of (token, vector)
                    # pairs, unlike the dict used for PyTorch models — confirm
                    # callers handle both shapes.
                    embeddings = []
                    for token in inputs:
                        embeddings.append((token, self.model_instance.get_word_vector(token)))

            doc_entry_list.append(embeddings)

        return doc_entry_list, len(inputs)
@@ -22,7 +22,12 @@ class LogitsExtractor:
22
22
  chunk_size = self.options['chunk_size']
23
23
  overlap_size = self.options['overlap_size']
24
24
 
25
- input_ids = tokens.to(self.device)
25
+ # Convert list of token IDs to tensor if needed
26
+ if isinstance(tokens, list):
27
+ input_ids = torch.tensor([tokens], device=self.device)
28
+ else:
29
+ input_ids = tokens.to(self.device)
30
+
26
31
  chunks = self._split_into_chunks(input_ids, chunk_size, overlap_size)
27
32
 
28
33
  per_token_data = []
@@ -4,7 +4,7 @@ import os
4
4
  import shutil
5
5
 
6
6
  from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
7
- from transformers import AutoModelForCausalLM
7
+ from transformers import AutoModelForCausalLM, AutoModelForMaskedLM, AutoModel
8
8
 
9
9
  class Model:
10
10
  def __init__(self, model_name, project_path):
@@ -75,7 +75,6 @@ class Model:
75
75
 
76
76
  print(f'FastText model loaded successfully from {model_path}')
77
77
  elif self.model_name == 'xlm-roberta-base':
78
- from transformers import AutoModel
79
78
  self.model_instance = AutoModel.from_pretrained(
80
79
  self.model_name,
81
80
  trust_remote_code=trust_remote_code,
@@ -25,9 +25,10 @@ from pelican_nlp.utils.setup_functions import subject_instantiator, load_config,
25
25
  from pelican_nlp.preprocessing import LPDS
26
26
  from pelican_nlp.utils.filename_parser import parse_lpds_filename
27
27
 
28
- from pelican_nlp.config import debug_print
28
+ from pelican_nlp.config import debug_print, RUN_TESTS, run_tests
29
29
 
30
30
  project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_fluency/config_fluency.yml'
31
+ #project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_discourse/config_discourse.yml'
31
32
 
32
33
  class Pelican:
33
34
 
@@ -40,12 +41,12 @@ class Pelican:
40
41
  # If no config path is provided, use the default config from package; used for dev-mode
41
42
  if config_path is None:
42
43
  package_dir = Path(__file__).parent
43
- default_config = package_dir / 'configuration_files' / 'config_fluency.yml'
44
+ default_config = package_dir / 'sample_configuration_files' / 'config_fluency.yml'
44
45
  if default_config.exists():
45
46
  config_path = str(default_config)
46
47
  print(f"Using default configuration file: {config_path}")
47
48
  else:
48
- sys.exit('Error: Default configuration file not found in package.')
49
+ sys.exit('Error: Default configuration file not found in sample_configuration_files folder.')
49
50
 
50
51
  # Verify the provided path is a YAML file
51
52
  elif not config_path.endswith(('.yml', '.yaml')):
@@ -72,12 +73,6 @@ class Pelican:
72
73
  """Execute the main processing pipeline."""
73
74
  self._clear_gpu_memory()
74
75
 
75
- '''
76
- #run unittests in dev_mode; not yet implemented
77
- if self.dev_mode:
78
- self._run_tests()
79
- '''
80
-
81
76
  self._handle_output_directory()
82
77
 
83
78
  # Check/Create LPDS
@@ -229,4 +224,8 @@ class Pelican:
229
224
 
230
225
 
231
226
  if __name__ == '__main__':
232
- Pelican(project_path, dev_mode=True).run()
227
+ if RUN_TESTS:
228
+ print("Running tests...")
229
+ run_tests()
230
+ else:
231
+ Pelican(project_path, dev_mode=True).run()
@@ -24,7 +24,8 @@ class TextTokenizer:
24
24
  # Tokenize using the model's tokenizer
25
25
  return self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=self.max_sequence_length).to(self.device_used)
26
26
  elif method == 'model':
27
- return self.tokenizer.encode(text, return_tensors='pt')
27
+ # For model method, return token IDs directly
28
+ return self.tokenizer.encode(text, add_special_tokens=True)
28
29
  else:
29
30
  raise ValueError(f"Unsupported tokenization method: {method}")
30
31
 
@@ -34,10 +35,14 @@ class TextTokenizer:
34
35
  def get_tokenizer(self):
35
36
  if self.tokenization_method == 'model' or self.tokenization_method == 'model_roberta':
36
37
  from transformers import AutoTokenizer
38
+ if not self.model_name:
39
+ raise ValueError("model_name must be provided for model-based tokenization methods")
37
40
  return AutoTokenizer.from_pretrained(
38
41
  self.model_name,
39
42
  trust_remote_code=False, # Don't execute arbitrary model code
40
43
  use_safetensors=True
41
44
  )
42
- else:
45
+ elif self.tokenization_method == 'whitespace':
43
46
  return None
47
+ else:
48
+ raise ValueError(f"Unsupported tokenization method: {self.tokenization_method}")