PyPI - pelican-nlp - Versions diffs - 0.3.2__tar.gz → 0.3.4__tar.gz - Mend

pelican-nlp 0.3.2tar.gz → 0.3.4tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (128) hide show

{pelican_nlp-0.3.2/pelican_nlp.egg-info → pelican_nlp-0.3.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pelican_nlp
-Version: 0.3.2
+Version: 0.3.4
 Summary: Preprocessing and Extraction of Linguistic Information for Computational Analysis
 Author-email: Yves Pauli <yves.pauli@gmail.com>
 License-Expression: CC-BY-NC-4.0

{pelican_nlp-0.3.2/examples/PyPI_testing_discourse → pelican_nlp-0.3.4/examples/example_discourse}/config_discourse.yml RENAMED Viewed

@@ -7,11 +7,15 @@ discourse: &discourse_flag true
 #general configurations; always adapt
 language: "german" # Possibly add options for German and English
-task_name: "interview" # Give name of task used for creation of the input file (e.g., ['fluency', 'interview'])
-corpus_names:
+task_name: "interview"
+#Create analysis corpus, group files based on corpus entity.
+corpus_key: "acq"
+corpus_values: #group names
+  - "placebo"
   - "schizophrenia"
-metric_to_extract: "embeddings" #Possible options: 'logits' or 'embeddings'
+metric_to_extract: "logits" #Possible options: 'logits' or 'embeddings'
 number_of_speakers: 3
 subject_speakertag: "B"
@@ -43,7 +47,7 @@ options_logits:
   keep_speakertags: true
 options_embeddings:
-  tokenization_method: "model_roberta" #or "whitespace", "model"
+  tokenization_method: "model" #"model" or "whitespace"
   max_length: 512 #max sequence length
   model_name: "xlm-roberta-base" #e.g. "fastText", "xlm-roberta-base"
   pytorch_based_model: true
@@ -59,10 +63,10 @@ options_embeddings:
     remove_punctuation_and_symbols: true
     remove_brackets_and_content: true
   semantic-similarity: false
+  distance-from-randomness: false
   window_size: null
   clean_tokens: false
-  distance-from-randomness: false
+  divergence_from_optimality: false
 #================================================================================
 #Extra configurations:
@@ -93,13 +97,13 @@ normalization_options:
   method: "lemmatization" #Options: lemmatization or stemming
 #================================================================
+create_aggregation_of_results: false
+output_document_information: false
 #Detail configurations; Changes optional, mostly used for quality checking / error handling
 number_of_subjects: null # Specify number of subjects; if 'null', number of subjects is automatically detected
 multiple_sessions: false # Set to True if multiple sessions per subject
 recompute_everything: true #If set to 'false' pelican-nlp will try to reuse previously computed results stored on your drive
-create_aggregation_of_results: false
-output_document_information: false

pelican_nlp-0.3.2/examples/PyPI_testing_discourse/subjects/sub-01/interview/sub-01_interview_schizophrenia_run-01.rtf → pelican_nlp-0.3.4/examples/example_discourse/subjects/sub-01/interview/sub-01_task-interview_acq-schizophrenia_run-01_transcript.rtf RENAMED Viewed

@@ -1,40 +1,40 @@
 {\rtf1\ansi\deff3\adeflang1025
-{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\froman\fprq2\fcharset0 Arial;}{\f5\froman\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f6\fnil\fprq2\fcharset0 Noto Sans CJK SC;}{\f7\fnil\fprq2\fcharset0 0;}{\f8\fnil\fprq2\fcharset0 Noto Sans Devanagari;}}
+{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\froman\fprq2\fcharset0 Arial;}{\f5\froman\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f6\fnil\fprq2\fcharset0 0;}{\f7\fnil\fprq2\fcharset0 Noto Sans CJK SC;}{\f8\fnil\fprq2\fcharset0 Noto Sans Devanagari;}}
 {\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
-{\stylesheet{\s0\snext0\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052 Normal;}
+{\stylesheet{\s0\snext0\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052 Normal;}
 {\*\cs15\snext15 Footnote Characters;}
 {\*\cs16\snext16\rtlch\ab \ltrch\loch\b Strong;}
-{\s17\sbasedon0\snext18\rtlch\af8\afs28\alang1081 \ltrch\lang1033\langfe2052\hich\af5\loch\ql\nowidctlpar\hyphpar1\sb240\sa120\keepn\ltrpar\cf0\f5\fs28\lang1033\kerning1\dbch\af6\langfe2052 Heading;}
-{\s18\sbasedon0\snext18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052 Body Text;}
-{\s19\sbasedon18\snext19\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052 List;}
-{\s20\sbasedon0\snext20\rtlch\af8\afs24\alang1081\ai \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\sb120\sa120\ltrpar\cf0\f3\fs24\lang1033\i\kerning1\dbch\af7\langfe2052 Caption;}
-{\s21\sbasedon0\snext21\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052 Index;}
+{\s17\sbasedon0\snext18\rtlch\af8\afs28\alang1081 \ltrch\lang1033\langfe2052\hich\af5\loch\ql\nowidctlpar\hyphpar1\sb240\sa120\keepn\ltrpar\cf0\f5\fs28\lang1033\kerning1\dbch\af7\langfe2052 Heading;}
+{\s18\sbasedon0\snext18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052 Body Text;}
+{\s19\sbasedon18\snext19\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052 List;}
+{\s20\sbasedon0\snext20\rtlch\af8\afs24\alang1081\ai \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\sb120\sa120\ltrpar\cf0\f3\fs24\lang1033\i\kerning1\dbch\af6\langfe2052 Caption;}
+{\s21\sbasedon0\snext21\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052 Index;}
 }{\*\generator LibreOffice/24.2.7.2$Linux_X86_64 LibreOffice_project/420$Build-2}{\info{\title 648866ebdbd870441d179a92}{\author Thomas Luthi-Bhatti}{\creatim\yr2023\mo6\dy14\hr17\min6}{\revtim\yr2025\mo4\dy8\hr13\min51}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops{\propname Operator}\proptype30{\staticval Ulrike Rachner}}\deftab720
 \hyphauto1\viewscale100\formshade\paperh16838\paperw11906\margl1417\margr1417\margt1417\margb1398\sectd\sbknone\sftnnar\saftnnrlc\sectunlocked1\pgwsxn11906\pghsxn16838\marglsxn1417\margrsxn1417\margtsxn1417\margbsxn1398\ftnbj\ftnstart1\ftnrestart\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
-{\*\ftnsep\chftnsep}\pgndec\pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052\sl100\slmult0\qc\hyphpar0\fi0\li0\lin0\ri0\rin0\sb238\sa0{\hich\af4\loch\cs16\rtlch\ab \ltrch\loch\b\fs22\lang1031\f4\loch
+{\*\ftnsep\chftnsep}\pgndec\pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl100\slmult0\qc\hyphpar0\fi0\li0\lin0\ri0\rin0\sb238\sa0{\hich\af4\loch\cs16\rtlch\ab \ltrch\loch\b\fs22\lang1031\f4\loch
 Interview with Interviewee}
-\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052\sl100\slmult0\qc\hyphpar0\fi0\li0\lin0\ri0\rin0\sb238\sa0\loch
+\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl100\slmult0\qc\hyphpar0\fi0\li0\lin0\ri0\rin0\sb238\sa0\loch
-\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
+\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
 I: Das ist f\u252\'fcr mich. Ich m\u246\'f6chte, dass Sie \u252\'fcber ein paar Dinge aus Ihrem t\u228\'e4glichen Leben sprechen. Sie m\u252\'fcssen (keinerlei?) Namen nennen, w\u228\'e4hrend Sie dieses Ereignis beschreiben. K\u246\'f6nnen Sie mir ein wenig \u252\'fcber sich erz\u228\'e4hlen? #00:00:14-00#\line B: (In Schriftsprache.) Ja, nat\u252\'fcrlich. Jeden Morgen beginne ich den Tag mit einer Tasse Tee. Ich bin jemand, der viel Wert auf eine ruhige Morgenroutine legt. Es ist f\u252\'fcr mich sehr wichtig, dass der Start in den Tag entspannt und nicht hektisch ist. Oft lese ich auch ein paar Seiten in einem Buch, das ich gerade lese. Danach gehe ich meistens zur Arbeit, entweder ins B\u252\'fcro oder arbeite von zu Hause aus. Mein Job ist sehr abwechslungsreich, und es gef\u228\'e4llt mir, immer neue Herausforderungen zu haben. Am Nachmittag gehe ich oft spazieren oder treffe mich mit Freunden. Ein gutes Gespr\u228\'e4ch oder eine kleine Wanderung in der Natur tut mir immer sehr gut. Am Abend koche ich gerne etwas Leckeres und entspanne mich beim Fernsehen oder h\u246\'f6re Musik. #00:00:51-00#}
-\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
+\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
 I: Wenn Sie zur\u252\'fcckdenken, k\u246\'f6nnen Sie mir eine Geschichte \u252\'fcber etwas Wichtiges erz\u228\'e4hlen, das in Ihrem Leben passiert ist? Die Geschichte kann aus einer beliebigen Zeit Ihres Lebens stammen, aus Ihrer Kindheit oder auch vor Kurzem. Sie brauchen keine Namen zu nennen, wenn Sie dieses Ereignis beschreiben. #00:04:19-00#\line B: Ich erinnere mich an eine Zeit, als ich mit meiner Familie in einem kleinen Dorf auf einem Berg war. Es war ein Winterwochenende, und wir hatten viel Schnee. An diesem Tag sind wir alle zusammen mit Schlitten den Hang hinuntergefahren. Es war eine sehr lustige Erfahrung, weil wir alle wie Kinder waren, trotz des Alters. Aber was mir wirklich in Erinnerung geblieben ist, war, dass ich mich nach diesem Tag viel n\u228\'e4her mit meiner Familie verbunden f\u252\'fchlte. Es war ein Moment, in dem wir uns alle unterst\u252\'fctzt und gemeinsam gelacht haben, was damals sehr wichtig f\u252\'fcr mich war. Diese Momente mit der Familie sind f\u252\'fcr mich unersetzlich. #00:05:42-00#}
-\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
+\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
 I: Ich m\u246\'f6chte Sie nun bitten, dass Sie ein wenig \u252\'fcber Ihre Gesundheit sprechen. Sie brauchen keine Namen zu nennen. Glauben Sie, dass Sie eine psychische Krankheit haben? Und wenn ja, worum handelt es sich Ihrer Meinung nach? #00:06:03-00#\line B: In letzter Zeit habe ich mich mehr mit meiner mentalen Gesundheit besch\u228\'e4ftigt. Ich w\u252\'fcrde sagen, dass ich in einer stabilen psychischen Verfassung bin, aber es gibt Momente, in denen ich mich \u252\'fcberfordert f\u252\'fchle. Gerade in stressigen Phasen merke ich, dass es schwieriger f\u252\'fcr mich ist, den Kopf klar zu behalten. Aber ich versuche, mir Hilfe zu suchen und achte sehr darauf, auf mich selbst zu h\u246\'f6ren. Es gibt Phasen, in denen ich das Gef\u252\'fchl habe, dass ich eine kurze Auszeit brauche, um mich wieder zu sortieren. Aber insgesamt denke ich, dass ich psychisch gesund bin, solange ich mir genug Zeit f\u252\'fcr mich nehme. #00:06:48-00#}
-\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
+\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
 I: Und wurde Ihnen eine Diagnose gestellt? #00:07:03-00#\line B: Nein, bisher nicht. #00:07:04-00#}
-\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
+\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
 I: Danke. Okay. Ich werde Ihnen jetzt drei Bilder zeigen, und eins nach dem anderen. Jedes Mal, wenn ich das Bild vor Sie lege, m\u246\'f6chte ich Sie bitten, das Bild so vollst\u228\'e4ndig wie m\u246\'f6glich zu beschreiben. Sagen Sie mir, was Sie auf dem Bild sehen und was Ihrer Meinung nach passieren k\u246\'f6nnte. Bitte sprechen Sie, bis ich Stopp sage. (...) Bild Nummer zwei. Bitte sagen Sie, was Sie auf diesem Bild sehen. #00:09:10-00#\line B: (Startet in Schriftsprache.) Auf diesem Bild sieht man eine Gruppe von Personen, die auf einem Markt stehen. Es ist ein lebhafter Ort, mit vielen St\u228\'e4nden und bunten Waren. In der Mitte sieht man eine \u228\'e4ltere Frau, die gerade eine Melone ausw\u228\'e4hlt. Sie tr\u228\'e4gt eine einfache, aber stilvolle Kleidung. Links sieht man einen jungen Mann, der mit einem Verk\u228\'e4ufer spricht, der gerade Tomaten in eine T\u252\'fcte packt. Im Hintergrund sieht man weitere Marktst\u228\'e4nde, die mit Obst und Gem\u252\'fcse voll sind. Der Himmel ist bew\u246\'f6lkt, und es sieht aus, als w\u252\'fcrde es bald regnen. Es scheint ein sch\u246\'f6ner, aber auch sehr besch\u228\'e4ftigter Tag zu sein. #00:10:37-00#}
-\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
+\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
 I: Danke sch\u246\'f6n. Bild Nummer vier. Was passiert auf diesem Bild? Oder was sehen Sie auf diesem Bild? #00:10:46-00#\line B: Auf diesem Bild sieht man einen Mann und eine Frau, die zusammen auf einer Bank sitzen. Der Mann ist in einem Anzug und schaut auf sein Handy. Die Frau tr\u228\'e4gt ein sommerliches Kleid und schaut nachdenklich in die Ferne. Sie scheint in einer anderen Welt zu sein, w\u228\'e4hrend der Mann abgelenkt ist. Im Hintergrund ist ein Park zu sehen, mit B\u228\'e4umen und einem kleinen See. Die Stimmung wirkt ein bisschen melancholisch, als ob beide Menschen in Gedanken versunken sind. Es scheint, als ob sie ein Gespr\u228\'e4ch f\u252\'fchren, aber jeder ist in seiner eigenen Welt. #00:12:00-00#}
-\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
+\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
 I: Danke sch\u246\'f6n. Und Bild Nummer 17GF, was sehen Sie auf diesem Bild? #00:12:09-00#\line B: Auf diesem Bild sieht man einen alten Leuchtturm, der auf einem Felsen \u252\'fcber dem Meer thront. Der Himmel ist dramatisch, mit dunklen Wolken und einer Art Sturmstimmung. Das Meer ist unruhig und st\u252\'fcrmisch, und man sieht die Wellen gegen den Felsen schlagen. In der N\u228\'e4he des Leuchtturms ist ein kleiner, alter Kutter zu sehen, der versucht, gegen die Wellen anzukommen. Es wirkt wie eine dramatische Szene, bei der der Leuchtturm als Rettungsanker in dieser st\u252\'fcrmischen See dient. Der Leuchtturm strahlt ein warmes Licht aus, das den Kutter zu f\u252\'fchren scheint. #00:13:23-00#}
-\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
+\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
 I: Danke. Gut, ich werde Ihnen nun einige Bilder aus einer Geschichte zeigen. Sie k\u246\'f6nnen sich so viel Zeit nehmen, wie Sie brauchen, um die Bilder anzuschauen. Nachdem Sie alle Bilder der Reihe nach angesehen haben, m\u246\'f6chte ich Sie bitten, mir die Geschichten auf den Bildern in Ihren eigenen Worten zu erz\u228\'e4hlen. Das ist die Geschichte. #00:13:47-00#\line B: Ich habe in der Kindheit oft getr\u228\'e4umt, dass ich in einem Wald unterwegs war. #00:15:59-00#}
-\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
+\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
 I: Im Wald? #00:15:56-00#\line B: Ja, genau. Ich war als Kind oft drau\u223\'dfen in den W\u228\'e4ldern, und in meinen Tr\u228\'e4umen bin ich immer tiefer in den Wald gegangen. Eines Tages kam ich an einen kleinen Bach, der durch den Wald floss. Der Bach war klar und das Wasser funkelte im Sonnenlicht. Ich sa\u223\'df dort und beobachtete die Fische, die durch das Wasser schwammen. Es war sehr ruhig, und ich f\u252\'fchlte mich v\u246\'f6llig friedlich. In diesem Moment hatte ich das Gef\u252\'fchl, dass ich ein Teil der Natur war und mit der Welt um mich herum eins. Es war ein sch\u246\'f6ner, friedlicher Traum, der mir auch als Erwachsener oft in den Sinn kommt. #00:16:44-00#}
-\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\sl276\slmult1\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052{\loch
+\par \pard\plain \s18\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\sl276\slmult1\ql\nowidctlpar\hyphpar1\sb0\sa140\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl276\slmult1{\loch
 I: Gut. Jetzt haben Sie eine Minute Zeit, um den Text durchzugehen, danach bitte ich Sie, das Blatt wegzulegen und mir die Geschichte in eigenen Worten zu erz\u228\'e4hlen. #00:17:51-00#\line B: Okay. #00:17:52-00#\line (Stille. B liest. #00:17:52-00# - #00:19:13-00#)\line B: (in Schriftsprache.) Die Geschichte handelt von einer kleinen Katze, die an einem sehr hei\u223\'dfen Tag im Schatten eines Baumes schl\u228\'e4ft. Sie tr\u228\'e4umt von einem k\u252\'fchlen Teich, an dem sie trinken kann. Als sie aufwacht, ist der Teich nicht mehr weit, und die Katze folgt einem Schmetterling, der sie zu einem geheimen, versteckten Ort f\u252\'fchrt. Der Teich ist klar, und die Katze kann endlich ihren Durst l\u246\'f6schen. Sie ist sehr zufrieden und kehrt sp\u228\'e4ter zur\u252\'fcck zu ihrem Baum, um sich wieder auszuruhen. #00:20:09-00#\line I: Danke sch\u246\'f6n.}
-\par \pard\plain \s0\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af7\langfe2052\sl100\slmult0\qc\hyphpar0\fi0\li0\lin0\ri0\rin0\sb238\sa0\loch
+\par \pard\plain \s0\rtlch\af8\afs24\alang1081 \ltrch\lang1033\langfe2052\hich\af3\loch\ql\nowidctlpar\hyphpar1\ltrpar\cf0\f3\fs24\lang1033\kerning1\dbch\af6\langfe2052\sl100\slmult0\qc\hyphpar0\fi0\li0\lin0\ri0\rin0\sb238\sa0\loch
 \par }

{pelican_nlp-0.3.2/pelican_nlp/configuration_files → pelican_nlp-0.3.4/examples/example_fluency}/config_fluency.yml RENAMED Viewed

@@ -8,7 +8,8 @@ fluency_task: &fluency_flag true
 language: "german"
 multiple_sessions: &session_flag false
-corpus_names: #names of fluency tasks (e.g. "animals", "clothes")
+corpus_key: "acq"
+corpus_values: #names of fluency tasks (e.g. "animals", "clothes")
   - "animals"
   - "clothes"
   - "food"
@@ -103,5 +104,3 @@ filename_components:
   metric: true
   additional_tags: []

pelican_nlp-0.3.4/examples/example_general/config_general.yml ADDED Viewed

@@ -0,0 +1,146 @@
+# Master Configuration File
+# ========================
+# Basic Settings
+# -------------
+input_file: "text"  # Options: 'text' or 'audio'
+language: "german"
+recompute_everything: true  # If false will give warning if output folder already exists
+# Task Configuration
+# -----------------
+task_name: null  # Name of task used for creation of data
+fluency_task: &fluency_flag false  # Flag for fluency-specific settings
+discourse: &discourse_flag false  # Flag for discourse-specific settings
+# Corpus Configuration
+# ------------------
+corpus_key: null # Entity key to group files for analysis
+corpus_values:  # Corresponding entity values found in dataset
+  - "healthy-control"
+  - "placebo"
+# Session and Subject Settings
+# --------------------------
+multiple_sessions: false
+number_of_subjects: null  # If null, auto-detected
+number_of_speakers: 1 # Specify the number of speakers for discourse files
+subject_speakertag: null  # Speaker tag for subject (e.g., "B"), only for discourse
+# Document Structure
+# ----------------
+has_multiple_sections: false
+has_section_titles: false
+section_identification: null  # e.g., "Section:", in case of multiple sections
+number_of_sections: null  # If null, auto-detected, specify for multiple sections to check section detection
+# Processing Pipeline
+# -----------------
+pipeline_options: # Just for data preprocessing without metric extraction
+  quality_check: false
+  clean_text: true
+  tokenize_text: false
+  normalize_text: false
+# Metric Extraction
+# ---------------
+metric_to_extract: "embeddings"  # Options: 'embeddings', 'logits'
+output_document_information: true
+# Cleaning Options
+# --------------
+cleaning_options:
+  general_cleaning: true # General cleaning applied to most datasets, check specifications in section "general_cleaning_options"
+  remove_punctuation: false
+  lowercase: true
+  remove_brackets_and_bracketcontent: false
+  remove_timestamps: false
+  timestamp_pattern_example: null  # e.g., "#00:00:23-00#", only if remove_timestamps = True
+  # Fluency-specific options
+  fluency_task: *fluency_flag
+  word_splitter: ';'
+  remove_hyphens: true
+  remove_duplicates: true
+general_cleaning_options:
+  strip_whitespace: true
+  merge_multiple_whitespaces: true
+  remove_whitespace_before_punctuation: true
+  merge_newline_characters: true
+  remove_backslashes: true
+# Embedding Options
+# ---------------
+options_embeddings:
+  tokenization_method: "whitespace"  # Options: 'whitespace', 'model'
+  model_name: "fastText"  # Options: 'fastText', 'xlm-roberta-base'
+  pytorch_based_model: false
+  method: "model_instance"
+  max_length: 512
+  clean_embedding_tokens: true
+  remove_punctuation: false
+  lowercase: false
+  keep_speakertags: false
+  semantic-similarity: true
+  window_size: null
+  clean_tokens: true
+  divergence_from_optimality: false
+  output_options:
+    exclude_special_tokens: true
+    remove_'_'_character: true
+    remove_speaker_labels: true
+    remove_punctuation_and_symbols: true
+    remove_brackets_and_content: true
+# Logits Options
+# -------------
+options_logits:
+  chunk_size: 128
+  overlap_size: 64
+  tokenization_method: "model"
+  model_name: "DiscoResearch/Llama3-German-8B-32k"
+  remove_punctuation: true
+  lowercase: true
+  keep_speakertags: true
+# Analysis Options
+# --------------
+options_semantic-similarity:
+  window_sizes:  # 'all' or window size as integer
+    - 2
+    - 8
+options_dis_from_randomness:
+  window_size: 8
+  min_len: null
+  bootstrap: 10000
+  shuffle_mode: 'include0_includeN'
+  parallel_computing: false
+# Normalization Options
+# -------------------
+normalization_options:
+  method: "lemmatization"  # Options: 'lemmatization', 'stemming'
+# Document Information Output
+# -------------------------
+document_information_output:
+  parameters:
+    - subject_ID
+    - fluency_word_count
+    - fluency_duplicate_count
+# Filename Configuration
+# --------------------
+filename_components:
+  subject: true    # mandatory
+  session: false
+  task: true       # mandatory
+  task_addition: false
+  corpus: true     # mandatory
+  metric: true
+  additional_tags: []
+# Additional Settings
+# -----------------
+create_aggregation_of_results: true

pelican_nlp-0.3.4/pelican_nlp/_version.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ __version__ = "0.3.4"

pelican_nlp-0.3.4/pelican_nlp/cli.py ADDED Viewed

@@ -0,0 +1,36 @@
+import os
+from pathlib import Path
+from pelican_nlp.main import Pelican
+from pelican_nlp.config import RUN_TESTS, run_tests
+def main():
+    # Run tests if enabled
+    if RUN_TESTS:
+        print("Running tests...")
+        run_tests()
+        return
+    # Look for configuration files in the current working directory
+    config_dir = Path.cwd()
+    print(f"Looking for configuration files in: {config_dir}")
+    config_files = [f for f in os.listdir(config_dir) if f.endswith((".yml", ".yaml"))]
+    if not config_files:
+        print("No .yml or .yaml configuration file found in the current directory.")
+        print("Please ensure you have a configuration file in your current working directory.")
+        return
+    if len(config_files) > 1:
+        print("Warning: Multiple configuration files found in current directory:")
+        for i, file in enumerate(config_files, 1):
+            print(f"  {i}. {file}")
+        print("Please ensure only one configuration file is present in the current directory.")
+        return
+    config_file = str(config_dir / config_files[0])
+    print(f"Using configuration file: {config_file}")
+    pelican = Pelican(config_file)
+    pelican.run()

pelican_nlp-0.3.4/pelican_nlp/config.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""
+Global configuration settings for the Pelican project.
+This file is not the configuration.yml file created for the user's adaptations.
+For consistency of pipeline, DO NOT CHANGE.
+"""
+# Debug flag - when True, debug output is meant to be printed (see debug_print)
+DEBUG_MODE = True
+# Test flag - set to True to run all example tests; run_tests() does nothing while this is False
+RUN_TESTS = False
+def debug_print(*args, **kwargs):
+    """Print only if debug mode is enabled."""
+    DEBUG_MODE = True
+    if DEBUG_MODE:
+        print(*args, **kwargs)
+def run_tests():
+    """Run all example tests if RUN_TESTS is enabled."""
+    if RUN_TESTS:
+        import unittest
+        from pathlib import Path
+        # Get the path to the test file
+        test_file = Path(__file__).parent / "utils" / "unittests" / "test_examples.py"
+        # Create a test suite and add the test file
+        loader = unittest.TestLoader()
+        suite = loader.discover(str(test_file.parent), pattern="test_examples.py")
+        # Run the tests
+        runner = unittest.TextTestRunner(verbosity=2)
+        runner.run(suite)

pelican_nlp-0.3.4/pelican_nlp/extraction/extract_embeddings.py ADDED Viewed

@@ -0,0 +1,106 @@
+from pelican_nlp.extraction.language_model import Model
+from pelican_nlp.preprocessing.text_tokenizer import TextTokenizer
+from pelican_nlp.config import debug_print
+class EmbeddingsExtractor:
+    def __init__(self, embeddings_configurations, project_path):
+        self.embeddings_configurations = embeddings_configurations
+        self.model_name = embeddings_configurations['model_name']  # Embedding model instance (e.g., fastText, RoBERTa)
+        self.model = Model(self.model_name, project_path)
+        self.Tokenizer = TextTokenizer(self.embeddings_configurations['tokenization_method'], self.model_name,
+                                       self.embeddings_configurations['max_length'])
+        self.model.load_model()
+        self.model_instance = self.model.model_instance
+    def extract_embeddings_from_text(self, text_list):
+        doc_entry_list = []
+        for text in text_list:
+            embeddings = {}
+            # Tokenize the input text
+            inputs = self.Tokenizer.tokenize_text(text)
+            debug_print(f'inputs are: {inputs}')
+            if self.embeddings_configurations['pytorch_based_model']:
+                #e.g. RoBERTa Model or Llama Model
+                import torch
+                with torch.no_grad():
+                    if 'llama' in self.model_name.lower():
+                        # Handle Llama models which expect input_ids directly
+                        outputs = self.model_instance(input_ids=inputs['input_ids'])
+                    else:
+                        # Handle RoBERTa and other models that accept **inputs
+                        if isinstance(inputs, dict):
+                            # Ensure inputs are on the same device as the model
+                            inputs = {k: v.to(self.model_instance.device) for k, v in inputs.items()}
+                            debug_print(f"Model inputs: {inputs}")
+                            outputs = self.model_instance(**inputs, output_hidden_states=True)
+                        else:
+                            debug_print(f"Input type: {type(inputs)}")
+                            debug_print(f"Input content: {inputs}")
+                            # If inputs is a list of strings, convert to token IDs first
+                            if isinstance(inputs, list):
+                                if isinstance(inputs[0], str):
+                                    # Convert tokens to IDs
+                                    token_ids = self.Tokenizer.tokenizer.convert_tokens_to_ids(inputs)
+                                    debug_print(f"Token IDs: {token_ids}")
+                                    inputs = torch.tensor([token_ids], device=self.model_instance.device)
+                                else:
+                                    # If it's already a list of numbers, convert directly
+                                    inputs = torch.tensor([inputs], device=self.model_instance.device)
+                            else:
+                                # If it's already a tensor, just move to device
+                                inputs = inputs.to(self.model_instance.device)
+                            debug_print(f"Final tensor shape: {inputs.shape}")
+                            # Ensure proper shape
+                            if len(inputs.shape) == 1:
+                                inputs = inputs.unsqueeze(0)  # Add batch dimension
+                            # Create attention mask
+                            attention_mask = torch.ones_like(inputs)
+                            debug_print(f"Model inputs - input_ids: {inputs.shape}, attention_mask: {attention_mask.shape}")
+                            outputs = self.model_instance(input_ids=inputs, attention_mask=attention_mask, output_hidden_states=True)
+                            debug_print(f"Model outputs type: {type(outputs)}")
+                            debug_print(f"Model outputs attributes: {dir(outputs)}")
+                # Get word embeddings (last hidden state)
+                if outputs is None:
+                    raise ValueError("Model returned None output")
+                if hasattr(outputs, 'hidden_states') and outputs.hidden_states is not None:
+                    word_embeddings = outputs.hidden_states[-1]
+                    debug_print(f"Using hidden_states, shape: {word_embeddings.shape}")
+                elif hasattr(outputs, 'last_hidden_state'):
+                    word_embeddings = outputs.last_hidden_state
+                    debug_print(f"Using last_hidden_state, shape: {word_embeddings.shape}")
+                else:
+                    raise ValueError(f"Model output has neither hidden_states nor last_hidden_state. Available attributes: {dir(outputs)}")
+                # Extract input_ids and convert them back to tokens
+                if isinstance(inputs, dict):
+                    input_ids = inputs['input_ids'][0].tolist()
+                else:
+                    input_ids = inputs[0].tolist()
+                tokens = self.Tokenizer.tokenizer.convert_ids_to_tokens(input_ids)
+                # Now align the tokens and embeddings
+                for token, embedding in zip(tokens, word_embeddings[0]):
+                    embeddings[token]=embedding.tolist()
+            else:
+                if self.model_name == 'fastText':
+                    embeddings = []
+                    for token in inputs:
+                        embeddings.append((token, self.model_instance.get_word_vector(token)))
+            doc_entry_list.append(embeddings)
+        return doc_entry_list, len(inputs)

{pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/extraction/extract_logits.py RENAMED Viewed

@@ -22,7 +22,12 @@ class LogitsExtractor:
         chunk_size = self.options['chunk_size']
         overlap_size = self.options['overlap_size']
-        input_ids = tokens.to(self.device)
+        # Convert list of token IDs to tensor if needed
+        if isinstance(tokens, list):
+            input_ids = torch.tensor([tokens], device=self.device)
+        else:
+            input_ids = tokens.to(self.device)
         chunks = self._split_into_chunks(input_ids, chunk_size, overlap_size)
         per_token_data = []

{pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/extraction/language_model.py RENAMED Viewed

@@ -4,7 +4,7 @@ import os
 import shutil
 from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model
-from transformers import AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoModelForMaskedLM, AutoModel
 class Model:
     def __init__(self, model_name, project_path):
@@ -75,7 +75,6 @@ class Model:
             print(f'FastText model loaded successfully from {model_path}')
         elif self.model_name == 'xlm-roberta-base':
-            from transformers import AutoModel
             self.model_instance = AutoModel.from_pretrained(
                 self.model_name,
                 trust_remote_code=trust_remote_code,

{pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/main.py RENAMED Viewed

@@ -25,9 +25,10 @@ from pelican_nlp.utils.setup_functions import subject_instantiator, load_config,
 from pelican_nlp.preprocessing import LPDS
 from pelican_nlp.utils.filename_parser import parse_lpds_filename
-from pelican_nlp.config import debug_print
+from pelican_nlp.config import debug_print, RUN_TESTS, run_tests
 project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_fluency/config_fluency.yml'
+#project_path = '/home/yvespauli/PycharmProjects/PyPI_testing_discourse/config_discourse.yml'
 class Pelican:
@@ -40,12 +41,12 @@ class Pelican:
         # If no config path is provided, use the default config from package; used for dev-mode
         if config_path is None:
             package_dir = Path(__file__).parent
-            default_config = package_dir / 'configuration_files' / 'config_fluency.yml'
+            default_config = package_dir / 'sample_configuration_files' / 'config_fluency.yml'
             if default_config.exists():
                 config_path = str(default_config)
                 print(f"Using default configuration file: {config_path}")
             else:
-                sys.exit('Error: Default configuration file not found in package.')
+                sys.exit('Error: Default configuration file not found in sample_configuration_files folder.')
         # Verify the provided path is a YAML file
         elif not config_path.endswith(('.yml', '.yaml')):
@@ -72,12 +73,6 @@ class Pelican:
         """Execute the main processing pipeline."""
         self._clear_gpu_memory()
-        '''
-        #run unittests in dev_mode; not yet implemented
-        if self.dev_mode:
-            self._run_tests()
-        '''
         self._handle_output_directory()
         # Check/Create LPDS
@@ -229,4 +224,8 @@ class Pelican:
 if __name__ == '__main__':
-    Pelican(project_path, dev_mode=True).run()
+    if RUN_TESTS:
+        print("Running tests...")
+        run_tests()
+    else:
+        Pelican(project_path, dev_mode=True).run()

{pelican_nlp-0.3.2 → pelican_nlp-0.3.4}/pelican_nlp/preprocessing/text_tokenizer.py RENAMED Viewed

@@ -24,7 +24,8 @@ class TextTokenizer:
             # Tokenize using the model's tokenizer
             return self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=self.max_sequence_length).to(self.device_used)
         elif method == 'model':
-            return self.tokenizer.encode(text, return_tensors='pt')
+            # For model method, return token IDs directly
+            return self.tokenizer.encode(text, add_special_tokens=True)
         else:
             raise ValueError(f"Unsupported tokenization method: {method}")
@@ -34,10 +35,14 @@ class TextTokenizer:
     def get_tokenizer(self):
         if self.tokenization_method == 'model' or self.tokenization_method == 'model_roberta':
             from transformers import AutoTokenizer
+            if not self.model_name:
+                raise ValueError("model_name must be provided for model-based tokenization methods")
             return AutoTokenizer.from_pretrained(
                 self.model_name,
                 trust_remote_code=False,  # Don't execute arbitrary model code
                 use_safetensors=True
             )
-        else:
+        elif self.tokenization_method == 'whitespace':
             return None
+        else:
+            raise ValueError(f"Unsupported tokenization method: {self.tokenization_method}")

pelican-nlp 0.3.2__tar.gz → 0.3.4__tar.gz

pelican-nlp 0.3.2tar.gz → 0.3.4tar.gz