speechmetryflow 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speechmetryflow/__init__.py +25 -0
- speechmetryflow/_version.py +34 -0
- speechmetryflow/cli.py +131 -0
- speechmetryflow/lexical/__init__.py +17 -0
- speechmetryflow/lexical/assets/__init__.py +103 -0
- speechmetryflow/lexical/assets/concreteness.tsv +39955 -0
- speechmetryflow/lexical/assets/familiarity_imageability.tsv +5554 -0
- speechmetryflow/lexical/assets/frequency.tsv +74289 -0
- speechmetryflow/lexical/assets/valence.tsv +13916 -0
- speechmetryflow/lexical/database.py +143 -0
- speechmetryflow/lexical/part_of_speech.py +172 -0
- speechmetryflow/lexical/utils.py +119 -0
- speechmetryflow/pragmatic/__init__.py +60 -0
- speechmetryflow/pragmatic/assets.py +62 -0
- speechmetryflow/pragmatic/database.py +113 -0
- speechmetryflow/semantic/__init__.py +23 -0
- speechmetryflow/semantic/assets.py +285 -0
- speechmetryflow/semantic/icu.py +33 -0
- speechmetryflow/semantic/idea_density.py +78 -0
- speechmetryflow/speech_production/__init__.py +62 -0
- speechmetryflow/speech_production/assets/__init__.py +58 -0
- speechmetryflow/speech_production/assets/words_fr.json +1 -0
- speechmetryflow/speech_production/fluency.py +42 -0
- speechmetryflow/speech_production/fragment.py +76 -0
- speechmetryflow/syntactic/__init__.py +119 -0
- speechmetryflow/syntactic/assets.py +40 -0
- speechmetryflow/syntactic/dependency.py +47 -0
- speechmetryflow/syntactic/sentences.py +189 -0
- speechmetryflow/ucsf_disfluency/__init__.py +23 -0
- speechmetryflow/utils.py +53 -0
- speechmetryflow-0.2.2.dist-info/METADATA +764 -0
- speechmetryflow-0.2.2.dist-info/RECORD +35 -0
- speechmetryflow-0.2.2.dist-info/WHEEL +4 -0
- speechmetryflow-0.2.2.dist-info/entry_points.txt +3 -0
- speechmetryflow-0.2.2.dist-info/licenses/LICENSE +674 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Public package interface for speechmetryflow.

Re-exports the metric subpackages and the package version so callers can
do ``import speechmetryflow as smf`` and reach ``smf.lexical`` etc.
"""

try:
    from speechmetryflow._version import __version__
except ImportError:
    # _version.py is generated at build time by setuptools-scm; it may be
    # absent when the package is run from an un-built source tree.
    pass

from speechmetryflow import (
    lexical,
    semantic,
    speech_production,
    syntactic,
    pragmatic,
    ucsf_disfluency,
    utils,
)

# Explicit public API of the package.
__all__ = [
    "__version__",
    "lexical",
    "semantic",
    "speech_production",
    "syntactic",
    "pragmatic",
    "ucsf_disfluency",
    "utils",
]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
# don't change, don't track in version control

__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# setuptools-scm emits this runtime-false flag so the typing imports below
# are only seen by static type checkers, never executed.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    VERSION_TUPLE = object
    COMMIT_ID = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

__version__ = version = '0.2.2'
__version_tuple__ = version_tuple = (0, 2, 2)

__commit_id__ = commit_id = None
|
speechmetryflow/cli.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
""""""
|
|
2
|
+
|
|
3
|
+
from typing import Dict
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
import json
|
|
7
|
+
import spacy
|
|
8
|
+
|
|
9
|
+
from textdescriptives.utils import _download_spacy_model
|
|
10
|
+
from textdescriptives.extractors import extract_dict
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import speechmetryflow as smf
|
|
14
|
+
from speechmetryflow.utils import compute_stemming
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@click.command()
@click.option("--lang", "-l", default="en")
@click.option("--task", "-t")
@click.option("--output", "-o", type=click.Path(dir_okay=False))
@click.option("--clobber", is_flag=True)
@click.argument("text_path", type=click.Path(exists=True, dir_okay=False))
def extract(
    text_path: Path,
    lang: str = "en",
    task: str = None,
    output: Path = None,
    clobber: bool = False,
) -> Dict:
    """Extract speech metrics from a transcript file and write them to JSON.

    Runs the UCSF disfluency pass on the raw text, cleans it, processes it
    with a spaCy + TextDescriptives pipeline, then collects speech-production,
    lexical, semantic, syntactic and pragmatic metrics.

    Parameters
    ----------
    text_path : Path
        Plain-text transcript; its filename is expected to start with a
        participant id followed by an underscore.
    lang : str
        Language code used to pick the spaCy model (default ``"en"``).
    task : str
        Task identifier forwarded to the metric modules.
    output : Path
        Destination JSON for speechmetryflow metrics; defaults to
        ``<stem>_metrics-speechmetryflow.json`` in the working directory.
        TextDescriptives metrics are always written next to it as
        ``<stem>_metrics-textdescriptives.json``.
    clobber : bool
        Overwrite an existing output file instead of raising.

    Returns
    -------
    Dict
        The speechmetryflow metrics dictionary that was written to disk.

    Raises
    ------
    FileExistsError
        If ``output`` exists and ``clobber`` is False.
    """
    # Read the transcript, collapsing newlines into single spaces.
    text_path = Path(text_path)
    with text_path.open("r", encoding="utf-8") as txt:
        raw_text = " ".join(txt.read().split("\n"))

    # Resolve the output path.
    if output is None:
        output = Path(f"{text_path.stem}_metrics-speechmetryflow.json")
    else:
        output = Path(output)

    # Refuse to overwrite an existing result unless --clobber was given.
    if output.exists() and not clobber:
        raise FileExistsError(
            f"File {output} already exists, use --clobber to overwrite."
        )

    # Disfluency metrics from UCSF, computed on the raw (uncleaned) text.
    ucsf_disfluency_metrics = smf.ucsf_disfluency.metrics(raw_text)
    text = smf.ucsf_disfluency.cleaning(raw_text)

    # General cleaning
    text = smf.utils.general_cleaning(text)

    # TextDescriptives pipeline
    spacy_model = _download_spacy_model(lang, "lg")
    nlp = spacy.load(spacy_model)
    nlp.add_pipe("textdescriptives/all")
    doc = nlp(text)

    # Punctuation-free token views used by the metric modules below.
    tokens_filtered = [token for token in doc if not token.is_punct]
    texts_filtered = [token.text for token in tokens_filtered]
    lemmas_filtered = [token.lemma_ for token in tokens_filtered]
    stems_filtered = compute_stemming(texts_filtered, lang)

    # Shared metadata for both metric containers (was duplicated inline).
    metadata = {
        "filename": text_path.name,
        "participant_id": text_path.name.split("_")[0],
        "language": lang,
        "spacy_model": spacy_model,
    }
    smf_metrics = dict(metadata)
    smf_metrics.update(ucsf_disfluency_metrics)

    td_metrics = dict(metadata)
    td_metrics.update(extract_dict(doc)[0])

    # speech_production
    speech_production_metrics = smf.speech_production.metrics(
        raw_text, texts_filtered, lemmas_filtered, lang, task
    )
    smf_metrics.update(speech_production_metrics)

    # lexical
    lexical_metrics = smf.lexical.metrics(doc, texts_filtered, stems_filtered, lang)
    smf_metrics.update(lexical_metrics)

    # semantic
    semantic_metrics = smf.semantic.metrics(
        raw_text, tokens_filtered, lang, task, speech_production_metrics["n_tokens"]
    )
    smf_metrics.update(semantic_metrics)

    # syntactic
    syntactic_metrics = smf.syntactic.metrics(doc, lang)
    smf_metrics.update(syntactic_metrics)

    # pragmatic
    pragmatic_metrics = smf.pragmatic.metrics(
        doc, lang, speech_production_metrics["n_tokens"]
    )
    smf_metrics.update(pragmatic_metrics)

    # write output
    with output.open("w", encoding="utf-8") as f:
        json.dump(smf_metrics, f, indent=4)

    output_td = Path(f"{text_path.stem}_metrics-textdescriptives.json")
    with output_td.open("w", encoding="utf-8") as f:
        json.dump(td_metrics, f, indent=4)

    # Honor the declared return annotation (the original returned None).
    return smf_metrics
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@click.command()
def download() -> None:
    """Download every external asset the extraction pipeline needs.

    Fetches the NLTK word list, the large English and French spaCy models,
    and the HuggingFace emotion / sentiment models so later `extract` runs
    work offline. Imports are kept local so the heavy dependencies are only
    loaded when this command is invoked.
    """
    import nltk

    nltk.download("words")

    _download_spacy_model("en", "lg")
    _download_spacy_model("fr", "lg")

    # AutoModelWithLMHead is deprecated in transformers; for T5 checkpoints
    # AutoModelForSeq2SeqLM resolves to the same concrete model class.
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

    AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion", legacy=False)
    AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-emotion")
    model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
    pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from speechmetryflow.lexical.part_of_speech import POS
|
|
2
|
+
from speechmetryflow.lexical.utils import (
|
|
3
|
+
count_deictic_pronouns,
|
|
4
|
+
count_indefinite_terms,
|
|
5
|
+
compute_honore_r_stat,
|
|
6
|
+
compute_brunet_index,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def metrics(doc, tokens, stems, lang):
    """Compute lexical metrics for a processed document.

    Combines part-of-speech metrics with deictic-pronoun counts,
    indefinite-term counts, Honore's R statistic and Brunet's W index.

    Parameters
    ----------
    doc
        spaCy Doc for the cleaned transcript.
    tokens
        Token texts with punctuation removed.
    stems
        Stems of those tokens.
    lang
        Language code ("en" or "fr").

    Returns
    -------
    dict
        Mapping of lexical metric names to values.
    """
    # Start from POS metrics, then layer the other lexical measures on top.
    results = POS(doc, lang).metrics
    results.update(count_deictic_pronouns(tokens, lang))
    results.update(count_indefinite_terms(tokens, lang))
    results["honore_r_stat"] = compute_honore_r_stat(stems)
    results["brunet_w_index"] = compute_brunet_index(stems)
    return results
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# Correspondence between English (Universal Dependencies) and French POS labels.
# NOTE(review): "PROPN" maps to "Nom propre" with a space while the other
# multi-word labels use underscores — confirm downstream consumers expect this.
pos_mapping = {
    "ADJ": "Adjectif",
    "ADP": "Preposition",
    "ADV": "Adverbe",
    "AUX": "Auxiliaire",
    "CONJ": "Conjonction",
    "CCONJ": "Conjonction_de_coordination",
    "DET": "Determinant",
    "INTJ": "Interjection",
    "NOUN": "Nom",
    "NUM": "Numeral",
    "PART": "Particule",
    "PRON": "Pronom",
    "PROPN": "Nom propre",
    "PUNCT": "Ponctuation",
    "SCONJ": "Conjonction_de_subordination",
    "SYM": "Symbole",
    "VERB": "Verbe",
    "X": "Autre",
}

# Deictic pronouns per language, grouped by deixis category.
# NOTE(review): the English "personal" set has only subject pronouns
# (no "me"/"us"/"them"); the French "temporal" set reuses the clitics
# "y"/"en", which are usually spatial/partitive — confirm both are intended.
deictic_pronouns = {
    "en": {
        "spatial": {"here", "there", "this", "these", "that", "those"},
        "personal": {"i", "you", "he", "she", "it", "we", "they"},
        "temporal": {"now", "then", "soon", "tomorrow"},
    },
    "fr": {
        "spatial": {
            "ce",
            "cet",
            "cette",
            "ces",
            "celui-ci",
            "celle-ci",
            "ceux-ci",
            "celles-ci",
            "celui-là",
            "celle-là",
            "ceux-là",
            "celles-là",
            "y",
            "en",
        },
        "personal": {"je", "tu", "il", "elle", "nous", "vous", "ils", "elles"},
        "temporal": {"y", "en"},
    },
}

# Indefinite (vague/non-specific) terms in English and French.
# Some entries are multi-word ("no one", "tout le monde"), so matching
# presumably happens on text spans rather than single tokens — verify
# against count_indefinite_terms.
indefinite_terms = {
    "en": [
        "thing",
        "stuff",
        "anything",
        "nothing",
        "anyone",
        "one",
        "either",
        "neither",
        "everyone",
        "no one",
        "someone",
        "anybody",
        "everybody",
        "nobody",
        "somebody",
        "another",
        "the other",
        "each",
        "little",
        "less",
        "much",
        "both",
        "few",
        "fewer",
        "many",
        "other",
        "others",
        "several",
    ],
    "fr": [
        "truc",
        "chose",
        "peu",
        "beaucoup",
        "quelques",
        "plusieurs",
        "quelqu'un",
        "tout le monde",
        "personne",
        "chacun",
        "n'importe qui",
        "autre",
        "l'autre",
        "chaque",
        "ni l'un ni l'autre",
        "les deux",
        "d'autres",
    ],
}
|