farahidi 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- farahidi/__init__.py +57 -0
- farahidi/analyzer.py +728 -0
- farahidi/cli.py +184 -0
- farahidi/clitics.py +150 -0
- farahidi/data/DATA.Clitics.Enclitics.map.jsonl.gz +0 -0
- farahidi/data/DATA.Clitics.Proclitics.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.CaseOrMood.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.Formulas.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.PartOfSpeech.Definit.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.PartOfSpeech.Gender.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.PartOfSpeech.Main.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.PartOfSpeech.NbRoot.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.PartOfSpeech.Number.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.PartOfSpeech.Number2.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.PartOfSpeech.Type.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.PartOfSpeech.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.Patterns.Lemmas.Voweled.Canonic.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.Patterns.Lemmas.Voweled.Diac.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.Patterns.Stems.Unvoweled.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.Patterns.Stems.Voweled.Canonic.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.Patterns.Stems.Voweled.Diac.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.Roots.Quadriliteral.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.Roots.Trilateral.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.Roots.id.Quadriliteral.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Nouns.Roots.id.Trilateral.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.CaseOrMood.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.Formulas.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.PartOfSpeech.Augmented.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.PartOfSpeech.Emphasized.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.PartOfSpeech.Emphasized2.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.PartOfSpeech.Main.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.PartOfSpeech.NbRoot.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.PartOfSpeech.Person.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.PartOfSpeech.Person2.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.PartOfSpeech.Transitivity.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.PartOfSpeech.Type.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.PartOfSpeech.Voice.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.PartOfSpeech.list.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.Patterns.Lemmas.Voweled.Canonic.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.Patterns.Lemmas.Voweled.Diac.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.Patterns.Stems.Unvoweled.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.Patterns.Stems.Voweled.Canonic.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.Patterns.Stems.Voweled.Diac.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.Roots.Quadriliteral.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.Roots.Trilateral.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.Roots.id.Quadriliteral.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.Roots.id.Quadriliteral2.map.jsonl.gz +0 -0
- farahidi/data/DATA.Derived.Verbs.Roots.id.Trilateral.map.jsonl.gz +0 -0
- farahidi/data/DATA.Exceptional.map.jsonl.gz +0 -0
- farahidi/data/DATA.MSA-LEMMA.ALL-train.map.jsonl.gz +0 -0
- farahidi/data/DATA.MSA.ALL.TRAIN.141809.lm.gz +0 -0
- farahidi/data/DATA.MSA.SHA.ROOT.map.jsonl.gz +0 -0
- farahidi/data/DATA.MSA.SHA.STEM.map.jsonl.gz +0 -0
- farahidi/data/DATA.NonDerived.Propernoun.CaseOrMood.list.jsonl.gz +0 -0
- farahidi/data/DATA.NonDerived.Propernoun.PartOfSpeech.list.jsonl.gz +0 -0
- farahidi/data/DATA.NonDerived.Propernoun.Unvoweled.map.jsonl.gz +0 -0
- farahidi/data/DATA.NonDerived.Propernoun.Voweled.list.jsonl.gz +0 -0
- farahidi/data/DATA.NonDerived.Toolwords.PartOfSpeech.list.jsonl.gz +0 -0
- farahidi/data/DATA.NonDerived.Toolwords.Unvoweled.map.jsonl.gz +0 -0
- farahidi/data/DATA.NonDerived.Toolwords.Voweled.list.jsonl.gz +0 -0
- farahidi/data/DATA.Root.map.jsonl.gz +0 -0
- farahidi/data/MANIFEST.txt +58 -0
- farahidi/disambiguate.py +238 -0
- farahidi/lexicon.py +192 -0
- farahidi/lm.py +133 -0
- farahidi/models.py +54 -0
- farahidi/normalize.py +388 -0
- farahidi/pos.py +73 -0
- farahidi/translit.py +86 -0
- farahidi/vowelize.py +89 -0
- farahidi-0.3.0.dist-info/METADATA +191 -0
- farahidi-0.3.0.dist-info/RECORD +76 -0
- farahidi-0.3.0.dist-info/WHEEL +4 -0
- farahidi-0.3.0.dist-info/entry_points.txt +2 -0
- farahidi-0.3.0.dist-info/licenses/LICENSE +674 -0
- farahidi-0.3.0.dist-info/licenses/NOTICE +28 -0
farahidi/__init__.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""farahidi — Arabic morphological analyzer (a pure-Python port of AlKhalil Morpho Sys 2).
|
|
2
|
+
|
|
3
|
+
Quick start::
|
|
4
|
+
|
|
5
|
+
import farahidi
|
|
6
|
+
|
|
7
|
+
for a in farahidi.analyze("لأنهم"):
|
|
8
|
+
print(a.voweled_word, a.lemma, a.root, a.part_of_speech)
|
|
9
|
+
|
|
10
|
+
:func:`analyze` returns a list of :class:`Analysis` candidates sorted by
|
|
11
|
+
``priority`` (most frequent first). For repeated use, build an :class:`Analyzer`
|
|
12
|
+
once and reuse it — the bundled lexicon loads lazily and is shared across calls.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from functools import lru_cache
|
|
18
|
+
|
|
19
|
+
from .analyzer import Analyzer
|
|
20
|
+
from .disambiguate import Disambiguator
|
|
21
|
+
from .models import Analysis, TokenResult
|
|
22
|
+
|
|
23
|
+
# Package version string, also re-exported through ``__all__`` below.
__version__ = "0.3.0"

# Names exported by ``from farahidi import *``: the public classes first,
# then the module-level convenience functions, then the version string.
__all__ = [
    "Analyzer", "Analysis", "Disambiguator", "TokenResult",
    "analyze", "analyze_text",
    "__version__",
]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@lru_cache(maxsize=1)
def _default_analyzer() -> Analyzer:
    """Return the shared module-level :class:`Analyzer`.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    constructor runs on the first call and the same instance is handed back
    on every later call.
    """
    shared = Analyzer()
    return shared
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@lru_cache(maxsize=1)
def _default_disambiguator() -> Disambiguator:
    """Return the shared module-level :class:`Disambiguator`.

    The disambiguator is built around the analyzer returned by
    ``_default_analyzer()`` and, being wrapped in ``lru_cache`` with no
    arguments, is constructed once and reused afterwards.
    """
    analyzer = _default_analyzer()
    return Disambiguator(analyzer=analyzer)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def analyze(word: str) -> list[Analysis]:
    """Analyze a single word with the shared module-level :class:`Analyzer`.

    Returns all analyses of ``word``, sorted by ``priority`` (descending).
    """
    analyzer = _default_analyzer()
    return analyzer.analyze(word)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def analyze_text(text: str) -> list[TokenResult]:
    """Analyze ``text`` in context with the shared :class:`Disambiguator`.

    In-context analysis: one chosen lemma/stem/root per word token across the
    sentence(s) in ``text``.
    """
    disambiguator = _default_disambiguator()
    return disambiguator.analyze_text(text)
|