speechmetryflow 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speechmetryflow/__init__.py +25 -0
- speechmetryflow/_version.py +34 -0
- speechmetryflow/cli.py +131 -0
- speechmetryflow/lexical/__init__.py +17 -0
- speechmetryflow/lexical/assets/__init__.py +103 -0
- speechmetryflow/lexical/assets/concreteness.tsv +39955 -0
- speechmetryflow/lexical/assets/familiarity_imageability.tsv +5554 -0
- speechmetryflow/lexical/assets/frequency.tsv +74289 -0
- speechmetryflow/lexical/assets/valence.tsv +13916 -0
- speechmetryflow/lexical/database.py +143 -0
- speechmetryflow/lexical/part_of_speech.py +172 -0
- speechmetryflow/lexical/utils.py +119 -0
- speechmetryflow/pragmatic/__init__.py +60 -0
- speechmetryflow/pragmatic/assets.py +62 -0
- speechmetryflow/pragmatic/database.py +113 -0
- speechmetryflow/semantic/__init__.py +23 -0
- speechmetryflow/semantic/assets.py +285 -0
- speechmetryflow/semantic/icu.py +33 -0
- speechmetryflow/semantic/idea_density.py +78 -0
- speechmetryflow/speech_production/__init__.py +62 -0
- speechmetryflow/speech_production/assets/__init__.py +58 -0
- speechmetryflow/speech_production/assets/words_fr.json +1 -0
- speechmetryflow/speech_production/fluency.py +42 -0
- speechmetryflow/speech_production/fragment.py +76 -0
- speechmetryflow/syntactic/__init__.py +119 -0
- speechmetryflow/syntactic/assets.py +40 -0
- speechmetryflow/syntactic/dependency.py +47 -0
- speechmetryflow/syntactic/sentences.py +189 -0
- speechmetryflow/ucsf_disfluency/__init__.py +23 -0
- speechmetryflow/utils.py +53 -0
- speechmetryflow-0.2.2.dist-info/METADATA +764 -0
- speechmetryflow-0.2.2.dist-info/RECORD +35 -0
- speechmetryflow-0.2.2.dist-info/WHEEL +4 -0
- speechmetryflow-0.2.2.dist-info/entry_points.txt +3 -0
- speechmetryflow-0.2.2.dist-info/licenses/LICENSE +674 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Public package interface for speechmetryflow.

Re-exports the metric subpackages and the package version so callers can
do ``import speechmetryflow as smf`` and reach ``smf.lexical`` etc.
"""

try:
    from speechmetryflow._version import __version__
except ImportError:
    # _version.py is generated at build time by setuptools-scm; it may be
    # absent when the package is run from an un-built source tree.
    pass

from speechmetryflow import (
    lexical,
    semantic,
    speech_production,
    syntactic,
    pragmatic,
    ucsf_disfluency,
    utils,
)

# Explicit public API of the package.
__all__ = [
    "__version__",
    "lexical",
    "semantic",
    "speech_production",
    "syntactic",
    "pragmatic",
    "ucsf_disfluency",
    "utils",
]
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# file generated by setuptools-scm
# don't change, don't track in version control

__all__ = [
    "__version__",
    "__version_tuple__",
    "version",
    "version_tuple",
    "__commit_id__",
    "commit_id",
]

# setuptools-scm emits this runtime-false flag so the typing imports below
# are only seen by static type checkers, never executed.
TYPE_CHECKING = False
if TYPE_CHECKING:
    from typing import Tuple
    from typing import Union

    VERSION_TUPLE = Tuple[Union[int, str], ...]
    COMMIT_ID = Union[str, None]
else:
    VERSION_TUPLE = object
    COMMIT_ID = object

version: str
__version__: str
__version_tuple__: VERSION_TUPLE
version_tuple: VERSION_TUPLE
commit_id: COMMIT_ID
__commit_id__: COMMIT_ID

__version__ = version = '0.2.2'
__version_tuple__ = version_tuple = (0, 2, 2)

__commit_id__ = commit_id = None
|
speechmetryflow/cli.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
""""""
|
|
2
|
+
|
|
3
|
+
from typing import Dict
|
|
4
|
+
|
|
5
|
+
import click
|
|
6
|
+
import json
|
|
7
|
+
import spacy
|
|
8
|
+
|
|
9
|
+
from textdescriptives.utils import _download_spacy_model
|
|
10
|
+
from textdescriptives.extractors import extract_dict
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import speechmetryflow as smf
|
|
14
|
+
from speechmetryflow.utils import compute_stemming
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@click.command()
@click.option("--lang", "-l", default="en")
@click.option("--task", "-t")
@click.option("--output", "-o", type=click.Path(dir_okay=False))
@click.option("--clobber", is_flag=True)
@click.argument("text_path", type=click.Path(exists=True, dir_okay=False))
def extract(
    text_path: Path,
    lang: str = "en",
    task: str = None,
    output: Path = None,
    clobber: bool = False,
) -> Dict:
    """Extract speech metrics from a transcript file and write them to JSON.

    Runs the UCSF disfluency pass on the raw text, cleans it, processes it
    with a spaCy + TextDescriptives pipeline, then collects speech-production,
    lexical, semantic, syntactic and pragmatic metrics.

    Parameters
    ----------
    text_path : Path
        Plain-text transcript; its filename is expected to start with a
        participant id followed by an underscore.
    lang : str
        Language code used to pick the spaCy model (default ``"en"``).
    task : str
        Task identifier forwarded to the metric modules.
    output : Path
        Destination JSON for speechmetryflow metrics; defaults to
        ``<stem>_metrics-speechmetryflow.json`` in the working directory.
        TextDescriptives metrics are always written next to it as
        ``<stem>_metrics-textdescriptives.json``.
    clobber : bool
        Overwrite an existing output file instead of raising.

    Returns
    -------
    Dict
        The speechmetryflow metrics dictionary that was written to disk.

    Raises
    ------
    FileExistsError
        If ``output`` exists and ``clobber`` is False.
    """
    # Read the transcript, collapsing newlines into single spaces.
    text_path = Path(text_path)
    with text_path.open("r", encoding="utf-8") as txt:
        raw_text = " ".join(txt.read().split("\n"))

    # Resolve the output path.
    if output is None:
        output = Path(f"{text_path.stem}_metrics-speechmetryflow.json")
    else:
        output = Path(output)

    # Refuse to overwrite an existing result unless --clobber was given.
    if output.exists() and not clobber:
        raise FileExistsError(
            f"File {output} already exists, use --clobber to overwrite."
        )

    # Disfluency metrics from UCSF, computed on the raw (uncleaned) text.
    ucsf_disfluency_metrics = smf.ucsf_disfluency.metrics(raw_text)
    text = smf.ucsf_disfluency.cleaning(raw_text)

    # General cleaning
    text = smf.utils.general_cleaning(text)

    # TextDescriptives pipeline
    spacy_model = _download_spacy_model(lang, "lg")
    nlp = spacy.load(spacy_model)
    nlp.add_pipe("textdescriptives/all")
    doc = nlp(text)

    # Punctuation-free token views used by the metric modules below.
    tokens_filtered = [token for token in doc if not token.is_punct]
    texts_filtered = [token.text for token in tokens_filtered]
    lemmas_filtered = [token.lemma_ for token in tokens_filtered]
    stems_filtered = compute_stemming(texts_filtered, lang)

    # Shared metadata for both metric containers (was duplicated inline).
    metadata = {
        "filename": text_path.name,
        "participant_id": text_path.name.split("_")[0],
        "language": lang,
        "spacy_model": spacy_model,
    }
    smf_metrics = dict(metadata)
    smf_metrics.update(ucsf_disfluency_metrics)

    td_metrics = dict(metadata)
    td_metrics.update(extract_dict(doc)[0])

    # speech_production
    speech_production_metrics = smf.speech_production.metrics(
        raw_text, texts_filtered, lemmas_filtered, lang, task
    )
    smf_metrics.update(speech_production_metrics)

    # lexical
    lexical_metrics = smf.lexical.metrics(doc, texts_filtered, stems_filtered, lang)
    smf_metrics.update(lexical_metrics)

    # semantic
    semantic_metrics = smf.semantic.metrics(
        raw_text, tokens_filtered, lang, task, speech_production_metrics["n_tokens"]
    )
    smf_metrics.update(semantic_metrics)

    # syntactic
    syntactic_metrics = smf.syntactic.metrics(doc, lang)
    smf_metrics.update(syntactic_metrics)

    # pragmatic
    pragmatic_metrics = smf.pragmatic.metrics(
        doc, lang, speech_production_metrics["n_tokens"]
    )
    smf_metrics.update(pragmatic_metrics)

    # write output
    with output.open("w", encoding="utf-8") as f:
        json.dump(smf_metrics, f, indent=4)

    output_td = Path(f"{text_path.stem}_metrics-textdescriptives.json")
    with output_td.open("w", encoding="utf-8") as f:
        json.dump(td_metrics, f, indent=4)

    # Honor the declared return annotation (the original returned None).
    return smf_metrics
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@click.command()
def download() -> None:
    """Download every external asset the extraction pipeline needs.

    Fetches the NLTK word list, the large English and French spaCy models,
    and the HuggingFace emotion / sentiment models so later `extract` runs
    work offline. Imports are kept local so the heavy dependencies are only
    loaded when this command is invoked.
    """
    import nltk

    nltk.download("words")

    _download_spacy_model("en", "lg")
    _download_spacy_model("fr", "lg")

    # AutoModelWithLMHead is deprecated in transformers; for T5 checkpoints
    # AutoModelForSeq2SeqLM resolves to the same concrete model class.
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

    AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion", legacy=False)
    AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-emotion")
    model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
    pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from speechmetryflow.lexical.part_of_speech import POS
|
|
2
|
+
from speechmetryflow.lexical.utils import (
|
|
3
|
+
count_deictic_pronouns,
|
|
4
|
+
count_indefinite_terms,
|
|
5
|
+
compute_honore_r_stat,
|
|
6
|
+
compute_brunet_index,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def metrics(doc, tokens, stems, lang):
    """Compute lexical metrics for a processed document.

    Combines part-of-speech metrics with deictic-pronoun counts,
    indefinite-term counts, Honore's R statistic and Brunet's W index.

    Parameters
    ----------
    doc
        spaCy Doc for the cleaned transcript.
    tokens
        Token texts with punctuation removed.
    stems
        Stems of those tokens.
    lang
        Language code ("en" or "fr").

    Returns
    -------
    dict
        Mapping of lexical metric names to values.
    """
    # Start from POS metrics, then layer the other lexical measures on top.
    results = POS(doc, lang).metrics
    results.update(count_deictic_pronouns(tokens, lang))
    results.update(count_indefinite_terms(tokens, lang))
    results["honore_r_stat"] = compute_honore_r_stat(stems)
    results["brunet_w_index"] = compute_brunet_index(stems)
    return results
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# Correspondence between English (Universal Dependencies) and French POS labels.
# NOTE(review): "PROPN" maps to "Nom propre" with a space while the other
# multi-word labels use underscores — confirm downstream consumers expect this.
pos_mapping = {
    "ADJ": "Adjectif",
    "ADP": "Preposition",
    "ADV": "Adverbe",
    "AUX": "Auxiliaire",
    "CONJ": "Conjonction",
    "CCONJ": "Conjonction_de_coordination",
    "DET": "Determinant",
    "INTJ": "Interjection",
    "NOUN": "Nom",
    "NUM": "Numeral",
    "PART": "Particule",
    "PRON": "Pronom",
    "PROPN": "Nom propre",
    "PUNCT": "Ponctuation",
    "SCONJ": "Conjonction_de_subordination",
    "SYM": "Symbole",
    "VERB": "Verbe",
    "X": "Autre",
}

# Deictic pronouns per language, grouped by deixis category.
# NOTE(review): the English "personal" set has only subject pronouns
# (no "me"/"us"/"them"); the French "temporal" set reuses the clitics
# "y"/"en", which are usually spatial/partitive — confirm both are intended.
deictic_pronouns = {
    "en": {
        "spatial": {"here", "there", "this", "these", "that", "those"},
        "personal": {"i", "you", "he", "she", "it", "we", "they"},
        "temporal": {"now", "then", "soon", "tomorrow"},
    },
    "fr": {
        "spatial": {
            "ce",
            "cet",
            "cette",
            "ces",
            "celui-ci",
            "celle-ci",
            "ceux-ci",
            "celles-ci",
            "celui-là",
            "celle-là",
            "ceux-là",
            "celles-là",
            "y",
            "en",
        },
        "personal": {"je", "tu", "il", "elle", "nous", "vous", "ils", "elles"},
        "temporal": {"y", "en"},
    },
}

# Indefinite (vague/non-specific) terms in English and French.
# Some entries are multi-word ("no one", "tout le monde"), so matching
# presumably happens on text spans rather than single tokens — verify
# against count_indefinite_terms.
indefinite_terms = {
    "en": [
        "thing",
        "stuff",
        "anything",
        "nothing",
        "anyone",
        "one",
        "either",
        "neither",
        "everyone",
        "no one",
        "someone",
        "anybody",
        "everybody",
        "nobody",
        "somebody",
        "another",
        "the other",
        "each",
        "little",
        "less",
        "much",
        "both",
        "few",
        "fewer",
        "many",
        "other",
        "others",
        "several",
    ],
    "fr": [
        "truc",
        "chose",
        "peu",
        "beaucoup",
        "quelques",
        "plusieurs",
        "quelqu'un",
        "tout le monde",
        "personne",
        "chacun",
        "n'importe qui",
        "autre",
        "l'autre",
        "chaque",
        "ni l'un ni l'autre",
        "les deux",
        "d'autres",
    ],
}
|