polystring 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polystring/__init__.py +29 -0
- polystring/_analyzer.py +133 -0
- polystring/_detector.py +74 -0
- polystring/_exceptions.py +17 -0
- polystring/_models.py +106 -0
- polystring/_ngram.py +144 -0
- polystring/_pipeline/__init__.py +0 -0
- polystring/_pipeline/stage1_preprocess.py +134 -0
- polystring/_pipeline/stage2_script.py +104 -0
- polystring/_pipeline/stage3_classify.py +176 -0
- polystring/_pipeline/stage4_context.py +108 -0
- polystring/_pipeline/stage5_merge.py +138 -0
- polystring/data/_background_ngram.json +1 -0
- polystring/data/sw_ngram.json +1 -0
- polystring/data/tl_ngram.json +1 -0
- polystring/data/ur_Latn_ngram.json +1 -0
- polystring/lexicons/__init__.py +116 -0
- polystring/lexicons/french.py +113 -0
- polystring/lexicons/german.py +111 -0
- polystring/lexicons/italian.py +113 -0
- polystring/lexicons/portuguese.py +117 -0
- polystring/lexicons/roman_urdu.py +130 -0
- polystring/lexicons/spanish.py +111 -0
- polystring/lexicons/swahili.py +89 -0
- polystring/lexicons/tagalog.py +100 -0
- polystring/lexicons/turkish.py +87 -0
- polystring/py.typed +0 -0
- polystring-0.1.0.dist-info/METADATA +257 -0
- polystring-0.1.0.dist-info/RECORD +31 -0
- polystring-0.1.0.dist-info/WHEEL +4 -0
- polystring-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# Tagalog lexicon: lowercase word forms grouped by part of speech.
#
# Words that overlap with other languages are deliberately left out; the
# inline "omitted" notes record that they live in CONFLICT_WORDS, which is
# defined outside this module.
#
# Each word is listed exactly once.  A word that fits several categories is
# kept under the first category it appears in, and later categories carry a
# short "listed above" note instead of a repeated literal.
# NOTE(review): presumably consumed by a per-token membership lookup —
# confirm against polystring/lexicons/__init__.py.
TAGALOG: set[str] = {
    # Core markers — extremely diagnostic, unique to Tagalog
    "mga", "ang", "ng", "ay", "nang",
    "nina", "sina",

    # Pronouns — nominative
    "siya", "sila", "kami", "tayo", "kayo", "ikaw", "ako",
    # Pronouns — genitive / oblique
    "niya", "nila", "namin", "natin", "ninyo", "nyo",
    "akin", "iyo", "kanya", "amin", "atin", "inyo",
    "kanila", "kanilang", "aking", "iyong", "kaniyang",

    # Demonstratives
    "ito", "iyon", "iyan", "dito", "doon", "diyan",
    "nito", "noon", "niyan", "rito", "roon",
    "ngayon", "kahapon", "bukas",

    # Negation / affirmation
    "hindi", "huwag", "wag", "oo", "opo", "oho",
    "di",

    # Particles (most diagnostic to Tagalog)
    "po", "ho", "kasi", "naman", "talaga", "daw", "raw",
    "ba", "rin", "pa", "lamang",
    # "din" omitted: in CONFLICT_WORDS (UR time word overlap)
    # "na" omitted: in CONFLICT_WORDS (UR + SW + DE overlap)
    # "lang" omitted: in CONFLICT_WORDS (EN word overlap)
    # "man" omitted: in CONFLICT_WORDS (TL + EN overlap)
    # "para" omitted: in CONFLICT_WORDS (TL + ES/PT overlap)
    "pala", "nga", "muna", "yata", "sana",

    # Existential
    "may", "mayroon", "wala", "meron",

    # Conjunctions / linkers
    "at", "pero", "ngunit", "o", "kahit", "kaya", "kung",
    "dahil", "habang", "kapag", "pagka", "bago", "matapos",
    # "para" omitted: in CONFLICT_WORDS (ES/PT "for" overlap)
    "upang", "sapagkat",

    # Prepositions / focus markers
    # ("nina" is already listed under core markers)
    "sa", "kay", "mula", "hanggang",
    # "para" omitted: in CONFLICT_WORDS
    "tungkol", "patungkol", "ukol",

    # High-frequency verbs (mag- / -um- / ma- forms)
    "kumain", "kakain", "kumakain",
    "pumunta", "pupunta", "pumupunta",
    "umalis", "aalis", "umaalis",
    "bumalik", "babalik", "bumabalik",
    "magluto", "magluluto", "nagluluto",
    "matulog", "matutulog", "natutulog",
    "gumawa", "gagawa", "gumagawa",
    "magsalita", "magsasalita",
    "umiyak", "iiyak",
    "tumawa", "tatawa",
    "magbasa", "magbabasa",
    "magsulat", "magsusulat",
    "magbayad", "magbabayad",
    "makita", "makikita",
    "marinig", "maririnig",
    "malaman", "malalaman",

    # Adjectives / descriptors — common
    # ("bago" is already listed under conjunctions / linkers)
    "maganda", "magandang", "masaya", "malungkot",
    "malaki", "maliit", "mahirap", "madali",
    "mahal", "mura", "luma", "mabilis",
    "mabagal", "mainit", "malamig", "masarap",
    "maayos", "matagal", "maikli",

    # Adverbs / degree words
    "medyo", "sobra", "talagang", "halos",
    "lagi", "palagi", "minsan", "madalas",
    "maagang", "agad", "kaagad",

    # Nouns — common
    "bahay", "trabaho", "pamilya", "bata", "buhay",
    "tao", "oras", "araw", "gabi", "umaga", "tanghali",
    "hapon", "linggo", "buwan", "taon", "lugar",
    "pagkain", "tubig", "kuya", "ate", "lola", "lolo",
    "nanay", "tatay", "kaibigan", "kapitbahay",
    "puso", "isip", "katawan", "kamay", "mata",
    "daan", "kalsada", "eskwela", "opisina",
    "pelikula", "musika", "laro",

    # Greetings / social
    "salamat", "kamusta", "kumusta", "paalam", "ingat",
    "mabuhay", "maligayang", "pasensya",

    # Numbers (written forms common in mixed text)
    "isa", "dalawa", "tatlo", "apat", "lima",
    "anim", "pito", "walo", "siyam", "sampu",

    # Discourse fillers
    # ("kaya" and "di" are already listed in earlier categories)
    "tapos", "tsaka", "saka", "parang",
    "ganon", "ganun", "gano", "ganito",
    "ayun", "ayan", "yun", "yung", "yong",
    "diba", "ano", "eh",
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Turkish lexicon: lowercase word forms grouped by part of speech.
#
# Entries are spelled without Turkish diacritics (e.g. "cunku", "simdi",
# "guzel", "hayir").  Two entries were originally written only with the
# dotless "ı" ("nasılsın", "ayrıca"); their ASCII-folded spellings are now
# listed as well so they are consistent with the rest of the set.  The
# original spellings are kept so that no previously matching lookup is lost.
# NOTE(review): which spelling actually matches depends on how tokens are
# normalised before lookup — confirm against the preprocessing stage.
#
# Words that overlap with other languages are left out; the inline "omitted"
# notes record that they live in CONFLICT_WORDS, which is defined outside
# this module.  Each word is listed exactly once, under the first category
# it appears in.
TURKISH: set[str] = {
    # Conjunctions — most diagnostic
    "ama", "fakat", "lakin", "ancak",
    "ve", "veya", "yahut",
    # "ya" omitted: in CONFLICT_WORDS (ES/UR/SW overlap)
    "ki", "ile", "hem",
    "cunku", "cunki",
    "eger", "yani", "oysa", "oysaki",
    "madem", "mademki",

    # High-frequency particles / postpositions
    # ("ile" is already listed under conjunctions)
    "bir", "bu", "o",
    # "su" omitted: in CONFLICT_WORDS (ES/IT/PT pronoun/possessive overlap)
    "var", "yok",
    "evet", "hayir",
    "iyi", "cok", "daha", "en",
    "sonra", "once", "simdi",
    "bugun", "yarin", "dun",
    "gibi", "kadar", "icin", "gore",
    "hep", "hic", "az",
    "zaten", "artik", "sadece", "bile",
    "yine", "tekrar", "gene",
    "hatta", "neyse", "belki",

    # Pronouns
    "ben", "sen", "biz", "siz", "onlar",
    "benim", "senin", "onun", "bizim", "sizin",
    "bana", "sana", "ona", "bize", "size", "onlara",
    "beni", "seni", "onu", "bizi", "sizi",
    "bende", "sende", "onda",
    "benimle", "seninle", "onunla",

    # Interrogatives
    "neden", "niye", "nasil", "nerede", "ne",
    "kim", "hangi", "kac", "nereye", "nereden",
    # NOTE(review): the only entry containing a space; a single-token lookup
    # would never match it ("ne" and "zaman" are both listed on their own).
    # Kept unchanged — confirm whether phrase lookups exist before removing.
    "ne zaman",

    # Common verbs — present / past stems
    "istiyorum", "istiyorsun", "istiyor", "istiyoruz",
    "biliyorum", "biliyorsun", "biliyor", "biliyoruz",
    "gidiyorum", "gidiyorsun", "gidiyor",
    "geliyorum", "geliyorsun", "geliyor",
    "yapiyorum", "yapiyorsun", "yapiyor",
    "goruyorum", "goruyorsun", "goruyor",
    "diyorum", "diyorsun", "diyor",
    "veriyorum", "veriyorsun", "veriyor",
    "aliyorum", "aliyorsun", "aliyor",
    "gitti", "geldi", "yapti", "soyledi",
    "oldu", "olacak", "olabilir",
    "gidecek", "gelecek", "yapacak",
    "etmek", "yapmak", "gitmek", "gelmek",
    "istemek", "bilmek", "gormek", "vermek",

    # Negation / modal
    # ("olabilir" is already listed under common verbs)
    "degil", "degilim", "degilsin",
    "olmaz", "olur",
    "yapamam", "gidemem",

    # Common nouns
    "zaman", "gun", "yil", "ay", "hafta",
    "sabah", "aksam", "gece", "oglen",
    "yer", "ev", "okul", "is", "yol",
    "adam", "kadin", "cocuk", "arkadaslar",
    "insan", "insanlar", "herkes", "kimse",
    "sey", "seyler", "sorun", "durum",
    "hayat", "dunya", "ulke", "sehir",

    # Adjectives
    "buyuk", "kucuk", "guzel", "kotu",
    "yeni", "eski", "uzun", "kisa",
    "sicak", "soguk", "sert", "yumusak",
    "kolay", "zor", "dogru", "yanlis",
    "onemli", "gercek",

    # Greetings / social
    "tamam", "tabii", "tabi",
    "lutfen", "tesekkur", "merhaba",
    "elbette", "gercekten", "kesinlikle",
    "nasılsın", "nasilsin", "naber",

    # Discourse / fillers
    # ("yani", "onun", "icin", "ne", "kadar" and "gece" are already listed
    # in earlier categories)
    "hani", "sanki", "mesela",
    "aslinda", "ayrıca", "ayrica",
    "boyle", "soyle", "boylece",
}
|
polystring/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: polystring
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Span-level language detection for mixed-language text
|
|
5
|
+
Project-URL: Homepage, https://github.com/saadlohani/polystring
|
|
6
|
+
Project-URL: Documentation, https://github.com/saadlohani/polystring#readme
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/saadlohani/polystring/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/saadlohani/polystring/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Saad Khan <saadlohani@yahoo.com>
|
|
10
|
+
License: MIT License
|
|
11
|
+
|
|
12
|
+
Copyright (c) 2026 Saad Khan
|
|
13
|
+
|
|
14
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
15
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
16
|
+
in the Software without restriction, including without limitation the rights
|
|
17
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
18
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
19
|
+
furnished to do so, subject to the following conditions:
|
|
20
|
+
|
|
21
|
+
The above copyright notice and this permission notice shall be included in all
|
|
22
|
+
copies or substantial portions of the Software.
|
|
23
|
+
|
|
24
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
25
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
26
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
27
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
28
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
29
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
30
|
+
SOFTWARE.
|
|
31
|
+
License-File: LICENSE
|
|
32
|
+
Keywords: code-switching,language-detection,mixed-language,multilingual,nlp
|
|
33
|
+
Classifier: Development Status :: 3 - Alpha
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: Intended Audience :: Science/Research
|
|
36
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
|
+
Classifier: Operating System :: OS Independent
|
|
38
|
+
Classifier: Programming Language :: Python :: 3
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
42
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
43
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
44
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
45
|
+
Classifier: Typing :: Typed
|
|
46
|
+
Requires-Python: >=3.10
|
|
47
|
+
Requires-Dist: lingua-language-detector>=2.0
|
|
48
|
+
Requires-Dist: regex>=2023.0
|
|
49
|
+
Requires-Dist: typing-extensions>=4.0; python_version < '3.11'
|
|
50
|
+
Provides-Extra: dev
|
|
51
|
+
Requires-Dist: mypy>=1.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: pre-commit; extra == 'dev'
|
|
53
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
54
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
55
|
+
Requires-Dist: ruff>=0.1; extra == 'dev'
|
|
56
|
+
Provides-Extra: pandas
|
|
57
|
+
Requires-Dist: pandas>=1.5; extra == 'pandas'
|
|
58
|
+
Description-Content-Type: text/markdown
|
|
59
|
+
|
|
60
|
+
# polystring
|
|
61
|
+
|
|
62
|
+
**Span-level language detection for mixed-language text.**
|
|
63
|
+
|
|
64
|
+
Most language detection libraries return a single label for the whole string. polystring returns a labelled span for _every part_ of the sentence, with character offsets, confidence scores, and special-token extraction baked in.
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from polystring import analyze
|
|
68
|
+
|
|
69
|
+
result = analyze("je suis tellement tired, this week has been rough")
|
|
70
|
+
|
|
71
|
+
for span in result.spans:
|
|
72
|
+
print(f"[{span.language}] {span.text!r}")
|
|
73
|
+
|
|
74
|
+
# [fr] 'je suis tellement tired'
|
|
75
|
+
# [en] 'this week has been rough'
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Why span-level detection matters
|
|
79
|
+
|
|
80
|
+
| Tool | What it returns for `"hola I love this city, en serio"` |
|
|
81
|
+
| -------------- | ------------------------------------------------------------- |
|
|
82
|
+
| `langdetect` | `"es"` (labels the whole string) |
|
|
83
|
+
| `lingua` | `"es"` (labels the whole string) |
|
|
84
|
+
| `langid` | `"es"` (labels the whole string) |
|
|
85
|
+
| **polystring** | `[es] "hola"` · `[en] "I love this city"` · `[es] "en serio"` |
|
|
86
|
+
|
|
87
|
+
Code-switching (mixing languages within a single sentence) is normal on social media, in diaspora communities, in customer support chats, and in any multilingual context. A single label for the whole input misses the structure entirely. polystring is built specifically for this problem.
|
|
88
|
+
|
|
89
|
+
## Installation
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pip install polystring
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Optional extras:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
pip install polystring[pandas] # enables result.to_dataframe()
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Examples
|
|
104
|
+
|
|
105
|
+
### Spanish / English (Spanglish)
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
result = analyze("no puedo creer how good this restaurant is, en serio")
|
|
109
|
+
|
|
110
|
+
for span in result.spans:
|
|
111
|
+
print(f"[{span.language}] {span.text!r}")
|
|
112
|
+
|
|
113
|
+
# [es] 'no puedo creer'
|
|
114
|
+
# [en] 'how good this restaurant is'
|
|
115
|
+
# [es] 'en serio'
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### French / English
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
result = analyze("je suis tellement tired lately, I need des vacances")
|
|
122
|
+
|
|
123
|
+
for span in result.spans:
|
|
124
|
+
print(f"[{span.language}] {span.text!r}")
|
|
125
|
+
|
|
126
|
+
# [fr] 'je suis tellement tired lately'
|
|
127
|
+
# [en] 'I need'
|
|
128
|
+
# [fr] 'des vacances'
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Mixed with non-Latin scripts
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
result = analyze("this is great هذا رائع جداً I am very impressed")
|
|
135
|
+
|
|
136
|
+
for span in result.spans:
|
|
137
|
+
print(f"[{span.language}] {span.text!r}")
|
|
138
|
+
|
|
139
|
+
# [en] 'this is great'
|
|
140
|
+
# [ar] 'هذا رائع جداً'
|
|
141
|
+
# [en] 'I am very impressed'
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Working with results
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
result = analyze("je suis tellement tired, this week has been rough")
|
|
148
|
+
|
|
149
|
+
result.dominant_language # 'fr'
|
|
150
|
+
result.is_mixed # True
|
|
151
|
+
result.languages # {'fr', 'en'}
|
|
152
|
+
result.confidence # 0.87 (mean confidence across linguistic spans)
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Serialise to dict
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
result.to_dict()
|
|
159
|
+
# {
|
|
160
|
+
# 'text': 'je suis tellement tired, this week has been rough',
|
|
161
|
+
# 'spans': [{'text': 'je suis tellement tired', 'language': 'fr', ...}, ...],
|
|
162
|
+
# 'dominant_language': 'fr',
|
|
163
|
+
# 'is_mixed': True,
|
|
164
|
+
# 'confidence': 0.87,
|
|
165
|
+
# ...
|
|
166
|
+
# }
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Serialise to DataFrame
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
# pip install polystring[pandas]
|
|
173
|
+
df = result.to_dataframe()
|
|
174
|
+
# text language token_type confidence start end is_foreign
|
|
175
|
+
# 0 je suis tellement tired fr text 0.91 0 23 False
|
|
176
|
+
# 1 this week has been rough en text 0.84 25 49 True
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### ANSI-coloured terminal output
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
print(result.highlight())
|
|
183
|
+
# [fr]je suis tellement tired [en]this week has been rough
|
|
184
|
+
# (each language rendered in a distinct colour)
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Filter to linguistic spans only
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
result.linguistic_spans()
|
|
191
|
+
# Returns spans with token_type == "text" only (no URLs, emoji, mentions, etc.)
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Span fields
|
|
195
|
+
|
|
196
|
+
| Field | Type | Description |
|
|
197
|
+
| ---------------------- | ----------- | --------------------------------------------------------------------------------------------- |
|
|
198
|
+
| `text` | `str` | Text as it appears in the input |
|
|
199
|
+
| `language` | `str` | ISO 639-1 code. `"ur-Latn"` for Roman Urdu, `"und"` for undetermined, `"ne"` for proper nouns |
|
|
200
|
+
| `token_type` | `str` | `"text"`, `"url"`, `"mention"`, `"hashtag"`, `"emoji"`, `"num"`, or `"ne"` |
|
|
201
|
+
| `confidence` | `float` | 0.0 to 1.0. Non-text tokens are always 0.0 |
|
|
202
|
+
| `start` / `end` | `int` | Character offsets into the original string |
|
|
203
|
+
| `is_foreign` | `bool` | `True` if this span is not the dominant language |
|
|
204
|
+
| `ambiguous_candidates` | `list[str]` | Populated when `language == "und"` due to a near-identical pair (e.g. `["es", "pt"]`) |
|
|
205
|
+
|
|
206
|
+
## Language coverage
|
|
207
|
+
|
|
208
|
+
polystring detects **75 languages** via [lingua](https://github.com/pemistahl/lingua-py). Non-Latin scripts (Arabic, Devanagari, CJK, Cyrillic, Thai, Hebrew, Korean, and more) are identified directly from Unicode ranges, no model call needed.
|
|
209
|
+
|
|
210
|
+
Nine languages have dedicated lexicons on top of the model, which significantly improves accuracy on short spans and code-switched text: Roman Urdu / Hinglish, French, Spanish, Portuguese, Italian, German, Turkish, Tagalog, and Swahili.
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
import polystring
|
|
214
|
+
print(polystring.supported_languages()) # full list of 75 ISO 639-1 codes
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Options
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
analyze(
|
|
221
|
+
text,
|
|
222
|
+
languages=["es", "en"], # restrict to known language set (faster, fewer false positives)
|
|
223
|
+
granularity="token", # "span" (default) or "token" to get per-word data
|
|
224
|
+
min_confidence=0.70, # tokens below this threshold become "und"
|
|
225
|
+
low_accuracy_mode=False, # lexicon + script detection only, no model (very fast)
|
|
226
|
+
normalize=True, # NFC normalisation
|
|
227
|
+
custom_lexicon={"sw": ["mambo", "vipi"]}, # inject domain-specific words
|
|
228
|
+
)
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### `granularity="token"` gives per-word data
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
result = analyze("bonjour how are you doing", granularity="token")
|
|
235
|
+
|
|
236
|
+
for tok in result.tokens:
|
|
237
|
+
print(f"[{tok.language}] {tok.text!r} ({tok.confidence:.2f})")
|
|
238
|
+
|
|
239
|
+
# [fr] 'bonjour' (0.92)
|
|
240
|
+
# [en] 'how' (0.83)
|
|
241
|
+
# [en] 'are' (0.81)
|
|
242
|
+
# [en] 'you' (0.85)
|
|
243
|
+
# [en] 'doing' (0.88)
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Contributing
|
|
247
|
+
|
|
248
|
+
```bash
|
|
249
|
+
git clone https://github.com/saadlohani/polystring
|
|
250
|
+
cd polystring
|
|
251
|
+
pip install -e ".[dev]"
|
|
252
|
+
pytest
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
## License
|
|
256
|
+
|
|
257
|
+
MIT
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
polystring/__init__.py,sha256=F4K4bX0InYaeFRuy6vq_7-b57bmrVgIsdVujzCstxZQ,763
|
|
2
|
+
polystring/_analyzer.py,sha256=m3rQ5PuWNQcwZEsxdmkxVebxNF0mOglPIryDpAd-QtQ,4506
|
|
3
|
+
polystring/_detector.py,sha256=Nd42QCoDM9sxPM1LeU2xamPhhzAOuHIdhki13H2CXaU,2374
|
|
4
|
+
polystring/_exceptions.py,sha256=KynuxDcgiTMh0Q9ne56feSvRPseIXPUxub-RNe8R90Y,492
|
|
5
|
+
polystring/_models.py,sha256=aQZx7tbV5JBm_mcvPliQUt3f4EXVLzdwqdcLziklRSc,3070
|
|
6
|
+
polystring/_ngram.py,sha256=SKZ5cr8dqTXgfU8MzUqc7_SM6jHxtyGNUuqjNPD_b00,4767
|
|
7
|
+
polystring/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
+
polystring/_pipeline/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
polystring/_pipeline/stage1_preprocess.py,sha256=YYwack0tjsP0oKUgZ2i0l5XSL0Ex3fsHEzR_zFkextA,3974
|
|
10
|
+
polystring/_pipeline/stage2_script.py,sha256=Zai8Zp9QEWLnsXLVDjGImGv1-YIpnvn4NHX01bfyQeU,3103
|
|
11
|
+
polystring/_pipeline/stage3_classify.py,sha256=LSMSxwGAX8ygr2OS8wrtTNHDpcrFNKEI-CpT8BliNes,6026
|
|
12
|
+
polystring/_pipeline/stage4_context.py,sha256=qT2FUiB_THBSMYHZTd7_95XsksTY2pEZAJaGJwms5Fc,3905
|
|
13
|
+
polystring/_pipeline/stage5_merge.py,sha256=sty1mRkfUUCifh4TIRWl6urpFduHedJuygfFtdsYC4M,4055
|
|
14
|
+
polystring/data/_background_ngram.json,sha256=PaIB7ahPqeoNfvoWOrntjYBOCDPf6WCWeTs3uowu7Zc,205636
|
|
15
|
+
polystring/data/sw_ngram.json,sha256=aDp_cESiKDcZVOurfuYh74fr9zI2huC90h7mrIXvgwM,90784
|
|
16
|
+
polystring/data/tl_ngram.json,sha256=ZE7jmNbQOSSpV5cAiYg9swR7PIUqtAlZpl4Tl0WYq78,53789
|
|
17
|
+
polystring/data/ur_Latn_ngram.json,sha256=w6003cWfauozf-nrxOO3-d2K_GjeDoMWF4s5BT130HM,70966
|
|
18
|
+
polystring/lexicons/__init__.py,sha256=Wa232ouDu2S15p4h-cyJL-alulbmrR7DHpK1mu9DQEk,4778
|
|
19
|
+
polystring/lexicons/french.py,sha256=RzaicKSuLQkydzvS_ilHIC8laR3GVFwB1J3O6rDqQuE,4109
|
|
20
|
+
polystring/lexicons/german.py,sha256=fyUvG6yjEDp2IiMnmyyjW-uY9FwcGGu3fHGLwoBt7go,3696
|
|
21
|
+
polystring/lexicons/italian.py,sha256=PyLFVRXCQZ3OeEUq3vBDtDEUtWY0j-kKzGVRa81LLec,3799
|
|
22
|
+
polystring/lexicons/portuguese.py,sha256=dfIBnrvDsJZlUXwSfmwRKB6iwDMonwiZqupWdr7mE5s,3714
|
|
23
|
+
polystring/lexicons/roman_urdu.py,sha256=E7vrZ1V8Ct3BSj66bCAh3As4Hqv8Xo_zgA-4XLuH2NA,5536
|
|
24
|
+
polystring/lexicons/spanish.py,sha256=FUkMhfLF6-F3vWnSTGVXhl07Ycg_Q7dMfws_d89ltbI,3666
|
|
25
|
+
polystring/lexicons/swahili.py,sha256=AITA0JtvQnNfq5KFa4pwX-h9Victtgl4hm2RKvdBddI,3142
|
|
26
|
+
polystring/lexicons/tagalog.py,sha256=IL8d5LVKwPHeRpOiOo-8VqVF2rqMAseOp3xa4WSfTwI,3525
|
|
27
|
+
polystring/lexicons/turkish.py,sha256=VIecmE3qPOqmJMw_v1fp2fqzs9T3kStE13_S6wRhTGk,2776
|
|
28
|
+
polystring-0.1.0.dist-info/METADATA,sha256=WwxGjj16R0_eDEvAWBvcrxILfRWKR6cbt3IyXIZvK_0,9481
|
|
29
|
+
polystring-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
30
|
+
polystring-0.1.0.dist-info/licenses/LICENSE,sha256=rUUE-ALdkoMOjO4rcHpV1mDkaFZzttO9xvkdZI_WNuA,1066
|
|
31
|
+
polystring-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Saad Khan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|