PyPI - batchalign - Versions diffs - 0.7.5a7.tar.gz → 0.7.6a0.tar.gz - Mend

batchalign 0.7.5a7.tar.gz → 0.7.6a0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (110) hide show

{batchalign-0.7.5a7/batchalign.egg-info → batchalign-0.7.6a0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: batchalign
-Version: 0.7.5a7
+Version: 0.7.6a0
 Summary: Python Speech Language Sample Analysis
 Author: Brian MacWhinney, Houjun Liu
 Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -22,7 +22,7 @@ Requires-Dist: plotly>=5.18.0
 Requires-Dist: transformers>=4.37
 Requires-Dist: tokenizers>=0.14.1
 Requires-Dist: pycountry>=22.3
-Requires-Dist: stanza>=1.7
+Requires-Dist: stanza[transformers]>=1.9.1
 Requires-Dist: scipy~=1.11
 Requires-Dist: rev_ai>=2.18.0
 Requires-Dist: rich~=13.6

{batchalign-0.7.5a7 → batchalign-0.7.6a0}/batchalign/cli/cli.py RENAMED Viewed

@@ -217,6 +217,27 @@ def morphotag(ctx, in_dir, out_dir, **kwargs):
               loader, writer, C)
+#################### COREF ################################
+@batchalign.command(hidden=True)
+@common_options
+@click.pass_context
+def coref(ctx, in_dir, out_dir, **kwargs):
+    """Perform coreference analysis on transcripts."""
+    def loader(file):
+        cf = CHATFile(path=os.path.abspath(file))
+        doc = cf.doc
+        return doc, {}
+    def writer(doc, output):
+        CHATFile(doc=doc).write(output)
+    _dispatch("coref", "eng", 1, ["cha"], ctx,
+              in_dir, out_dir,
+              loader, writer, C)
 #################### UTSEG ################################
 @batchalign.command()

{batchalign-0.7.5a7 → batchalign-0.7.6a0}/batchalign/cli/dispatch.py RENAMED Viewed

@@ -47,6 +47,7 @@ Cmd2Task = {
     "morphotag": "morphosyntax",
     "benchmark": "asr,eval",
     "utseg": "utterance",
+    "coref": "coref",
 }
 # this is the main runner used by all functions

{batchalign-0.7.5a7 → batchalign-0.7.6a0}/batchalign/document.py RENAMED Viewed

@@ -29,7 +29,8 @@ class Task(IntEnum):
     FORCED_ALIGNMENT = 9
     FEATURE_EXTRACT = 10
     MORPHOSYNTAX = 11
-    WER = 12
+    COREF = 12
+    WER = 13
     DEBUG__G = 0
@@ -51,6 +52,7 @@ TypeMap = {
     Task.FEATURE_EXTRACT: TaskType.ANALYSIS,
     Task.RETRACE_ANALYSIS: TaskType.PROCESSING,
     Task.DISFLUENCY_ANALYSIS: TaskType.PROCESSING,
+    Task.COREF: TaskType.PROCESSING,
     Task.WER: TaskType.ANALYSIS,
     Task.DEBUG__G: TaskType.GENERATION,
@@ -69,6 +71,7 @@ TaskFriendlyName = {
     Task.FEATURE_EXTRACT: "Feature Extraction",
     Task.RETRACE_ANALYSIS:  "Retrace Analysis",
     Task.DISFLUENCY_ANALYSIS:  "Disfluncy Analysis",
+    Task.COREF:  "Coreference Resolution",
     Task.WER:  "Word Error Rate",
     Task.DEBUG__G:  "TEST_GENERATION",
     Task.DEBUG__P:  "TEST_PROCESSING",
@@ -103,12 +106,18 @@ class Morphology(BaseModel):
     pos: str # pos like "pron"
     feats: str # string feats "Dem-Acc-S1"
+class Coref(BaseModel):
+    start: bool
+    end: bool
+    chain: int
 class Form(BaseModel):
     text: str # the text
     # MILLISECONDS
     time: Optional[Tuple[int, int]] = Field(default=None) # word bullet
     morphology: Optional[List[Morphology]] = Field(default=None) # mor
     dependency: Optional[List[Dependency]] = Field(default=None) # gra
+    coreference: Optional[List[Coref]] = Field(default=None) # coref
     type: TokenType = Field(default=TokenType.REGULAR) # whether the field is a regular word (i.e. not a filled pause, not a feature, not a retrace, etc.)
 class Tier(BaseModel):

{batchalign-0.7.5a7 → batchalign-0.7.6a0}/batchalign/formats/chat/generator.py RENAMED Viewed

@@ -33,6 +33,8 @@ def generate_chat_utterance(utterance: Utterance, special_mor=False, write_wor=T
     gras = []
     has_wor = False
     wor_elems = []
+    has_coref = False
+    coref_elems = []
     for i in utterance.content:
         mors.append(i.morphology)
@@ -43,6 +45,21 @@ def generate_chat_utterance(utterance: Utterance, special_mor=False, write_wor=T
         else:
             wor_elems.append(i.text)
+        if i.coreference:
+            has_coref = True
+            coref_str_form = ""
+            for j in i.coreference:
+                coref_str = ""
+                if j.start:
+                    coref_str += "("
+                coref_str += str(j.chain)
+                if j.end:
+                    coref_str += ")"
+                coref_str_form += coref_str
+            coref_elems.append(coref_str_form)
+        else:
+            coref_elems.append("-")
         if bool(mors[-1]) != bool(gras[-1]):
             warnings.warn(f"Batchalign has detected a mismatch between lengths of mor and gra tiers for utterance; output will not pass CHATTER; line='{main_line}'")
@@ -75,6 +92,9 @@ def generate_chat_utterance(utterance: Utterance, special_mor=False, write_wor=T
     #### WOR LINE GENERATION ####
     if has_wor and write_wor:
         result.append("%wor:\t"+" ".join(wor_elems))
+    if has_coref:
+        result.append("%coref:\t"+" ".join(coref_elems))
     #### EXTRA LINE GENERATION ####

{batchalign-0.7.5a7 → batchalign-0.7.6a0}/batchalign/models/resolve.py RENAMED Viewed

@@ -12,6 +12,7 @@ resolver = {
     },
     "whisper": {
         'eng': ("talkbank/CHATWhisper-en-large-v1", "openai/whisper-large-v2"),
+        'yue': ("alvanlii/whisper-small-cantonese", "alvanlii/whisper-small-cantonese"),
     }
 }

{batchalign-0.7.5a7 → batchalign-0.7.6a0}/batchalign/pipelines/__init__.py RENAMED Viewed

@@ -2,7 +2,7 @@ from .pipeline import BatchalignPipeline
 from .base import BatchalignEngine
 from .asr import WhisperEngine, RevEngine, WhisperXEngine
-from .morphosyntax import StanzaEngine
+from .morphosyntax import StanzaEngine, CorefEngine
 from .cleanup import NgramRetraceEngine, DisfluencyReplacementEngine
 from .speaker import NemoSpeakerEngine

{batchalign-0.7.5a7 → batchalign-0.7.6a0}/batchalign/pipelines/dispatch.py RENAMED Viewed

@@ -6,7 +6,7 @@ Tabulate default packages and options.
 from batchalign import (WhisperEngine, WhisperFAEngine, StanzaEngine, RevEngine,
                         NgramRetraceEngine, DisfluencyReplacementEngine, WhisperUTREngine,
                         RevUTREngine, EvaluationEngine, WhisperXEngine, NemoSpeakerEngine,
-                        StanzaUtteranceEngine)
+                        StanzaUtteranceEngine, CorefEngine)
 from batchalign import BatchalignPipeline
 from batchalign.models import resolve
@@ -27,6 +27,7 @@ DEFAULT_PACKAGES = {
     "retracing": "ngram",
     "eval": "evaluation",
     "utterance": "stanza_utt",
+    "coref": "stanza_coref",
 }
 LANGUAGE_OVERRIDE_PACKAGES = {
@@ -124,6 +125,9 @@ def dispatch_pipeline(pkg_str, lang, num_speakers=None, **arg_overrides):
             engines.append(NemoSpeakerEngine(num_speakers=num_speakers))
         elif engine == "stanza_utt":
             engines.append(StanzaUtteranceEngine())
+        elif engine == "stanza_coref":
+            engines.append(CorefEngine())
     L.debug(f"Done initalizing packages.")
     return BatchalignPipeline(*engines)

batchalign-0.7.6a0/batchalign/pipelines/morphosyntax/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .ud import StanzaEngine
+from .coref import CorefEngine

batchalign-0.7.6a0/batchalign/pipelines/morphosyntax/coref.py ADDED Viewed

@@ -0,0 +1,45 @@
+import stanza
+from batchalign.utils.dp import PayloadTarget, ReferenceTarget, Match, align
+from warnings import warn
+from batchalign.document import *
+from batchalign.constants import *
+from batchalign.pipelines.base import *
+from batchalign.formats.chat.parser import chat_parse_utterance
+from batchalign.utils.dp import *
+class CorefEngine(BatchalignEngine):
+    tasks = [ Task.COREF ]
+    def process(self, doc, **kwargs):
+        if "eng" not in doc.langs:
+            warn("Coreference resolution is only supported for English documents.")
+            return
+        detokenized = " ".join([i.strip(include_retrace=True, include_fp=True) for i in doc.content if isinstance(i, Utterance)])
+        pipeline = stanza.Pipeline(lang="en", processors="tokenize, coref")
+        coref_chains = pipeline(detokenized).sentences
+        coref_chains = [(j.text,
+                        [Coref(start=chain.is_start,
+                                end=chain.is_end,
+                                chain=chain.chain.index) for chain in j.coref_chains])
+                        for i in coref_chains
+                        for j in i.words]
+        payloads = [PayloadTarget(i[0], i[1]) for i in coref_chains]
+        references = [ReferenceTarget(j.text, (ut_id, form_id)) for ut_id, i in enumerate(doc.content)
+                        if isinstance(i, Utterance)
+                        for form_id, j in enumerate(i.content)]
+        alignment = align(payloads, references, tqdm=False)
+        for i in alignment:
+            if isinstance(i, Match):
+                (ut, form) = i.reference_payload
+                doc.content[ut].content[form].coreference = i.payload
+        return doc

batchalign-0.7.6a0/batchalign/version ADDED Viewed

@@ -0,0 +1,3 @@
+0.7.6-alpha.0
+September 27th, 2024
+initial coreference support

{batchalign-0.7.5a7 → batchalign-0.7.6a0/batchalign.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: batchalign
-Version: 0.7.5a7
+Version: 0.7.6a0
 Summary: Python Speech Language Sample Analysis
 Author: Brian MacWhinney, Houjun Liu
 Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -22,7 +22,7 @@ Requires-Dist: plotly>=5.18.0
 Requires-Dist: transformers>=4.37
 Requires-Dist: tokenizers>=0.14.1
 Requires-Dist: pycountry>=22.3
-Requires-Dist: stanza>=1.7
+Requires-Dist: stanza[transformers]>=1.9.1
 Requires-Dist: scipy~=1.11
 Requires-Dist: rev_ai>=2.18.0
 Requires-Dist: rich~=13.6

{batchalign-0.7.5a7 → batchalign-0.7.6a0}/batchalign.egg-info/SOURCES.txt RENAMED Viewed

@@ -70,6 +70,7 @@ batchalign/pipelines/cleanup/support/test.test
 batchalign/pipelines/fa/__init__.py
 batchalign/pipelines/fa/whisper_fa.py
 batchalign/pipelines/morphosyntax/__init__.py
+batchalign/pipelines/morphosyntax/coref.py
 batchalign/pipelines/morphosyntax/ud.py
 batchalign/pipelines/morphosyntax/fr/case.py
 batchalign/pipelines/morphosyntax/ja/verbforms.py

{batchalign-0.7.5a7 → batchalign-0.7.6a0}/batchalign.egg-info/requires.txt RENAMED Viewed

@@ -12,7 +12,7 @@ plotly>=5.18.0
 transformers>=4.37
 tokenizers>=0.14.1
 pycountry>=22.3
-stanza>=1.7
+stanza[transformers]>=1.9.1
 scipy~=1.11
 rev_ai>=2.18.0
 rich~=13.6

{batchalign-0.7.5a7 → batchalign-0.7.6a0}/setup.py RENAMED Viewed

@@ -40,7 +40,7 @@ setup(
         "transformers>=4.37",
         "tokenizers>=0.14.1",
         "pycountry>=22.3",
-        "stanza>=1.7",
+        "stanza[transformers]>=1.9.1",
         "scipy~=1.11",
         "rev_ai>=2.18.0",
         "rich~=13.6",