speechmetryflow 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. speechmetryflow/__init__.py +25 -0
  2. speechmetryflow/_version.py +34 -0
  3. speechmetryflow/cli.py +131 -0
  4. speechmetryflow/lexical/__init__.py +17 -0
  5. speechmetryflow/lexical/assets/__init__.py +103 -0
  6. speechmetryflow/lexical/assets/concreteness.tsv +39955 -0
  7. speechmetryflow/lexical/assets/familiarity_imageability.tsv +5554 -0
  8. speechmetryflow/lexical/assets/frequency.tsv +74289 -0
  9. speechmetryflow/lexical/assets/valence.tsv +13916 -0
  10. speechmetryflow/lexical/database.py +143 -0
  11. speechmetryflow/lexical/part_of_speech.py +172 -0
  12. speechmetryflow/lexical/utils.py +119 -0
  13. speechmetryflow/pragmatic/__init__.py +60 -0
  14. speechmetryflow/pragmatic/assets.py +62 -0
  15. speechmetryflow/pragmatic/database.py +113 -0
  16. speechmetryflow/semantic/__init__.py +23 -0
  17. speechmetryflow/semantic/assets.py +285 -0
  18. speechmetryflow/semantic/icu.py +33 -0
  19. speechmetryflow/semantic/idea_density.py +78 -0
  20. speechmetryflow/speech_production/__init__.py +62 -0
  21. speechmetryflow/speech_production/assets/__init__.py +58 -0
  22. speechmetryflow/speech_production/assets/words_fr.json +1 -0
  23. speechmetryflow/speech_production/fluency.py +42 -0
  24. speechmetryflow/speech_production/fragment.py +76 -0
  25. speechmetryflow/syntactic/__init__.py +119 -0
  26. speechmetryflow/syntactic/assets.py +40 -0
  27. speechmetryflow/syntactic/dependency.py +47 -0
  28. speechmetryflow/syntactic/sentences.py +189 -0
  29. speechmetryflow/ucsf_disfluency/__init__.py +23 -0
  30. speechmetryflow/utils.py +53 -0
  31. speechmetryflow-0.2.2.dist-info/METADATA +764 -0
  32. speechmetryflow-0.2.2.dist-info/RECORD +35 -0
  33. speechmetryflow-0.2.2.dist-info/WHEEL +4 -0
  34. speechmetryflow-0.2.2.dist-info/entry_points.txt +3 -0
  35. speechmetryflow-0.2.2.dist-info/licenses/LICENSE +674 -0
"""Top-level package for speechmetryflow: re-export the public sub-modules."""

try:
    from speechmetryflow._version import __version__
except ImportError:
    # Fall back to a sentinel so ``__version__`` always exists even when the
    # generated ``_version.py`` is absent (e.g. running from a raw checkout).
    # The original silently ``pass``-ed here, which left ``__version__``
    # undefined while still listed in ``__all__`` — making
    # ``from speechmetryflow import *`` raise AttributeError.
    __version__ = "0+unknown"

from speechmetryflow import (
    lexical,
    semantic,
    speech_production,
    syntactic,
    pragmatic,
    ucsf_disfluency,
    utils,
)

__all__ = [
    "__version__",
    "lexical",
    "semantic",
    "speech_production",
    "syntactic",
    "pragmatic",
    "ucsf_disfluency",
    "utils",
]
# file generated by setuptools-scm
# don't change, don't track in version control

__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    # At runtime the aliases are mere placeholders: they only need to be
    # valid annotation targets, never real types.
    VERSION_TUPLE = object
    COMMIT_ID = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

__version__ = version = '0.2.2'
__version_tuple__ = version_tuple = (0, 2, 2)

__commit_id__ = commit_id = None
speechmetryflow/cli.py ADDED
@@ -0,0 +1,131 @@
1
+ """"""
2
+
3
+ from typing import Dict
4
+
5
+ import click
6
+ import json
7
+ import spacy
8
+
9
+ from textdescriptives.utils import _download_spacy_model
10
+ from textdescriptives.extractors import extract_dict
11
+ from pathlib import Path
12
+
13
+ import speechmetryflow as smf
14
+ from speechmetryflow.utils import compute_stemming
15
+
16
+
@click.command()
@click.option("--lang", "-l", default="en")
@click.option("--task", "-t")
@click.option("--output", "-o", type=click.Path(dir_okay=False))
@click.option("--clobber", is_flag=True)
@click.argument("text_path", type=click.Path(exists=True, dir_okay=False))
def extract(
    text_path: Path,
    lang: str = "en",  # default now matches the click option default
    task: str = None,
    output: Path = None,
    clobber: bool = False,
) -> Dict:
    """Extract speechmetryflow and textdescriptives metrics from a transcript.

    Reads TEXT_PATH, runs the full metric pipeline (disfluency, speech
    production, lexical, semantic, syntactic, pragmatic) and writes two JSON
    files: the speechmetryflow metrics (to ``output``) and the
    textdescriptives metrics (next to the input file).

    Returns:
        The speechmetryflow metrics dict (the original annotated ``Dict``
        but returned ``None``; now the annotation is honored).

    Raises:
        FileExistsError: if ``output`` exists and ``--clobber`` was not given.
    """
    # Read the transcript as a single line of text (explicit UTF-8 so the
    # result does not depend on the platform's locale encoding).
    text_path = Path(text_path)
    with text_path.open("r", encoding="utf-8") as txt:
        raw_text = txt.read().replace("\n", " ")

    # Default output name is derived from the input file name.
    if output is None:
        output = Path(f"{text_path.stem}_metrics-speechmetryflow.json")
    else:
        output = Path(output)

    # Check if output file already exists
    if output.exists() and not clobber:
        raise FileExistsError(
            f"File {output} already exists, use --clobber to overwrite."
        )

    # Disfluency metrics from UCSF, then strip the disfluency annotations.
    ucsf_disfluency_metrics = smf.ucsf_disfluency.metrics(raw_text)
    text = smf.ucsf_disfluency.cleaning(raw_text)

    # General cleaning
    text = smf.utils.general_cleaning(text)

    # TextDescriptives pipeline on the cleaned text.
    spacy_model = _download_spacy_model(lang, "lg")
    nlp = spacy.load(spacy_model)
    nlp.add_pipe("textdescriptives/all")
    doc = nlp(text)

    tokens_filtered = [token for token in doc if not token.is_punct]
    texts_filtered = [token.text for token in tokens_filtered]
    lemmas_filtered = [token.lemma_ for token in tokens_filtered]
    stems_filtered = compute_stemming(texts_filtered, lang)

    # Shared metadata header for both output files (was duplicated inline).
    header = {
        "filename": text_path.name,
        "participant_id": text_path.name.split("_")[0],
        "language": lang,
        "spacy_model": spacy_model,
    }

    smf_metrics = dict(header)
    smf_metrics.update(ucsf_disfluency_metrics)

    td_metrics = dict(header)
    td_metrics.update(extract_dict(doc)[0])

    # speech_production
    speech_production_metrics = smf.speech_production.metrics(
        raw_text, texts_filtered, lemmas_filtered, lang, task
    )
    smf_metrics.update(speech_production_metrics)

    # lexical
    smf_metrics.update(
        smf.lexical.metrics(doc, texts_filtered, stems_filtered, lang)
    )

    # semantic
    smf_metrics.update(
        smf.semantic.metrics(
            raw_text, tokens_filtered, lang, task,
            speech_production_metrics["n_tokens"],
        )
    )

    # syntactic
    smf_metrics.update(smf.syntactic.metrics(doc, lang))

    # pragmatic
    smf_metrics.update(
        smf.pragmatic.metrics(doc, lang, speech_production_metrics["n_tokens"])
    )

    # write output
    with output.open("w", encoding="utf-8") as f:
        json.dump(smf_metrics, f, indent=4)

    output_td = Path(f"{text_path.stem}_metrics-textdescriptives.json")
    with output_td.open("w", encoding="utf-8") as f:
        json.dump(td_metrics, f, indent=4)

    return smf_metrics
115
+
116
+
@click.command()
def download():
    """Pre-download every third-party asset the extraction pipeline needs:
    the nltk word list, the large English and French spaCy models, and the
    Hugging Face emotion and sentiment models."""
    import nltk

    nltk.download("words")

    _download_spacy_model("en", "lg")
    _download_spacy_model("fr", "lg")

    # Imported lazily so the transformers dependency is only required when
    # this command is actually run.
    from transformers import AutoTokenizer, AutoModelWithLMHead, pipeline

    # NOTE(review): AutoModelWithLMHead is deprecated in recent transformers
    # releases — confirm the pinned transformers version still exposes it.
    AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion", legacy=False)
    AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-emotion")
    model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
    pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)
@@ -0,0 +1,17 @@
1
+ from speechmetryflow.lexical.part_of_speech import POS
2
+ from speechmetryflow.lexical.utils import (
3
+ count_deictic_pronouns,
4
+ count_indefinite_terms,
5
+ compute_honore_r_stat,
6
+ compute_brunet_index,
7
+ )
8
+
9
+
def metrics(doc, tokens, stems, lang):
    """Aggregate the lexical metrics for one document.

    Combines part-of-speech metrics, deictic-pronoun and indefinite-term
    counts, and the Honoré / Brunet lexical-richness statistics into a
    single flat dict.
    """
    results = POS(doc, lang).metrics
    for counter in (count_deictic_pronouns, count_indefinite_terms):
        results.update(counter(tokens, lang))
    results["honore_r_stat"] = compute_honore_r_stat(stems)
    results["brunet_w_index"] = compute_brunet_index(stems)
    return results
@@ -0,0 +1,103 @@
# Map Universal Dependencies POS tags (English labels) to French labels.
pos_mapping = dict(
    ADJ="Adjectif",
    ADP="Preposition",
    ADV="Adverbe",
    AUX="Auxiliaire",
    CONJ="Conjonction",
    CCONJ="Conjonction_de_coordination",
    DET="Determinant",
    INTJ="Interjection",
    NOUN="Nom",
    NUM="Numeral",
    PART="Particule",
    PRON="Pronom",
    PROPN="Nom propre",
    PUNCT="Ponctuation",
    SCONJ="Conjonction_de_subordination",
    SYM="Symbole",
    VERB="Verbe",
    X="Autre",
)
22
+
# Deictic pronouns per language, grouped by deictic category
# (spatial / personal / temporal).  Values are sets for O(1) membership.
deictic_pronouns = {
    "en": {
        "spatial": set("here there this these that those".split()),
        "personal": set("i you he she it we they".split()),
        "temporal": set("now then soon tomorrow".split()),
    },
    "fr": {
        "spatial": set(
            "ce cet cette ces celui-ci celle-ci ceux-ci celles-ci "
            "celui-là celle-là ceux-là celles-là y en".split()
        ),
        "personal": set("je tu il elle nous vous ils elles".split()),
        "temporal": set("y en".split()),
    },
}
51
+
# Indefinite / low-specificity terms per language.  Multi-word entries
# ("no one", "tout le monde", ...) are kept as single strings —
# NOTE(review): confirm the counting code matches them as phrases.
_INDEFINITE_EN = [
    "thing", "stuff", "anything", "nothing", "anyone", "one", "either",
    "neither", "everyone", "no one", "someone", "anybody", "everybody",
    "nobody", "somebody", "another", "the other", "each", "little", "less",
    "much", "both", "few", "fewer", "many", "other", "others", "several",
]
_INDEFINITE_FR = [
    "truc", "chose", "peu", "beaucoup", "quelques", "plusieurs",
    "quelqu'un", "tout le monde", "personne", "chacun", "n'importe qui",
    "autre", "l'autre", "chaque", "ni l'un ni l'autre", "les deux",
    "d'autres",
]
indefinite_terms = {"en": _INDEFINITE_EN, "fr": _INDEFINITE_FR}