polystring 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polystring-0.1.0/.gitattributes +2 -0
- polystring-0.1.0/.github/workflows/ci.yml +78 -0
- polystring-0.1.0/.gitignore +46 -0
- polystring-0.1.0/.pre-commit-config.yaml +18 -0
- polystring-0.1.0/CHANGELOG.md +25 -0
- polystring-0.1.0/LICENSE +21 -0
- polystring-0.1.0/PKG-INFO +257 -0
- polystring-0.1.0/README.md +198 -0
- polystring-0.1.0/pyproject.toml +71 -0
- polystring-0.1.0/src/polystring/__init__.py +29 -0
- polystring-0.1.0/src/polystring/_analyzer.py +133 -0
- polystring-0.1.0/src/polystring/_detector.py +74 -0
- polystring-0.1.0/src/polystring/_exceptions.py +17 -0
- polystring-0.1.0/src/polystring/_models.py +106 -0
- polystring-0.1.0/src/polystring/_ngram.py +144 -0
- polystring-0.1.0/src/polystring/_pipeline/__init__.py +0 -0
- polystring-0.1.0/src/polystring/_pipeline/stage1_preprocess.py +134 -0
- polystring-0.1.0/src/polystring/_pipeline/stage2_script.py +104 -0
- polystring-0.1.0/src/polystring/_pipeline/stage3_classify.py +176 -0
- polystring-0.1.0/src/polystring/_pipeline/stage4_context.py +108 -0
- polystring-0.1.0/src/polystring/_pipeline/stage5_merge.py +138 -0
- polystring-0.1.0/src/polystring/data/_background_ngram.json +1 -0
- polystring-0.1.0/src/polystring/data/sw_ngram.json +1 -0
- polystring-0.1.0/src/polystring/data/tl_ngram.json +1 -0
- polystring-0.1.0/src/polystring/data/ur_Latn_ngram.json +1 -0
- polystring-0.1.0/src/polystring/lexicons/__init__.py +116 -0
- polystring-0.1.0/src/polystring/lexicons/french.py +113 -0
- polystring-0.1.0/src/polystring/lexicons/german.py +111 -0
- polystring-0.1.0/src/polystring/lexicons/italian.py +113 -0
- polystring-0.1.0/src/polystring/lexicons/portuguese.py +117 -0
- polystring-0.1.0/src/polystring/lexicons/roman_urdu.py +130 -0
- polystring-0.1.0/src/polystring/lexicons/spanish.py +111 -0
- polystring-0.1.0/src/polystring/lexicons/swahili.py +89 -0
- polystring-0.1.0/src/polystring/lexicons/tagalog.py +100 -0
- polystring-0.1.0/src/polystring/lexicons/turkish.py +87 -0
- polystring-0.1.0/src/polystring/py.typed +0 -0
- polystring-0.1.0/tests/__init__.py +0 -0
- polystring-0.1.0/tests/fixtures/mixed_text_samples.json +271 -0
- polystring-0.1.0/tests/test_analyzer.py +186 -0
- polystring-0.1.0/tests/test_context_correction.py +60 -0
- polystring-0.1.0/tests/test_lexicons.py +75 -0
- polystring-0.1.0/tests/test_ngram.py +133 -0
- polystring-0.1.0/tests/test_preprocessing.py +51 -0
- polystring-0.1.0/tests/test_real_world.py +372 -0
- polystring-0.1.0/tests/test_script_detection.py +43 -0
- polystring-0.1.0/tests/test_span_merging.py +65 -0
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
tags: ["v*"]
|
|
7
|
+
pull_request:
|
|
8
|
+
branches: [main]
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
test:
|
|
12
|
+
name: Test (Python ${{ matrix.python-version }})
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
strategy:
|
|
15
|
+
fail-fast: false
|
|
16
|
+
matrix:
|
|
17
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
18
|
+
|
|
19
|
+
steps:
|
|
20
|
+
- uses: actions/checkout@v4
|
|
21
|
+
|
|
22
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
23
|
+
uses: actions/setup-python@v5
|
|
24
|
+
with:
|
|
25
|
+
python-version: ${{ matrix.python-version }}
|
|
26
|
+
cache: pip
|
|
27
|
+
|
|
28
|
+
- name: Install dependencies
|
|
29
|
+
run: pip install -e ".[dev]"
|
|
30
|
+
|
|
31
|
+
- name: Run tests
|
|
32
|
+
run: pytest --no-cov -q
|
|
33
|
+
|
|
34
|
+
lint:
|
|
35
|
+
name: Lint
|
|
36
|
+
runs-on: ubuntu-latest
|
|
37
|
+
|
|
38
|
+
steps:
|
|
39
|
+
- uses: actions/checkout@v4
|
|
40
|
+
|
|
41
|
+
- name: Set up Python
|
|
42
|
+
uses: actions/setup-python@v5
|
|
43
|
+
with:
|
|
44
|
+
python-version: "3.13"
|
|
45
|
+
cache: pip
|
|
46
|
+
|
|
47
|
+
- name: Install ruff
|
|
48
|
+
run: pip install "ruff>=0.1"
|
|
49
|
+
|
|
50
|
+
- name: Run ruff
|
|
51
|
+
run: ruff check src/ tests/
|
|
52
|
+
|
|
53
|
+
publish:
|
|
54
|
+
name: Publish to PyPI
|
|
55
|
+
runs-on: ubuntu-latest
|
|
56
|
+
needs: [test, lint]
|
|
57
|
+
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
|
|
58
|
+
environment: pypi
|
|
59
|
+
permissions:
|
|
60
|
+
id-token: write
|
|
61
|
+
|
|
62
|
+
steps:
|
|
63
|
+
- uses: actions/checkout@v4
|
|
64
|
+
|
|
65
|
+
- name: Set up Python
|
|
66
|
+
uses: actions/setup-python@v5
|
|
67
|
+
with:
|
|
68
|
+
python-version: "3.13"
|
|
69
|
+
cache: pip
|
|
70
|
+
|
|
71
|
+
- name: Install hatch
|
|
72
|
+
run: pip install hatch
|
|
73
|
+
|
|
74
|
+
- name: Build
|
|
75
|
+
run: hatch build
|
|
76
|
+
|
|
77
|
+
- name: Publish to PyPI
|
|
78
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.pyo
|
|
5
|
+
*.pyd
|
|
6
|
+
*.so
|
|
7
|
+
*.egg
|
|
8
|
+
*.egg-info/
|
|
9
|
+
build/
|
|
10
|
+
.eggs/
|
|
11
|
+
|
|
12
|
+
# Virtual environments
|
|
13
|
+
.venv/
|
|
14
|
+
venv/
|
|
15
|
+
env/
|
|
16
|
+
.env
|
|
17
|
+
|
|
18
|
+
# Testing & coverage
|
|
19
|
+
.pytest_cache/
|
|
20
|
+
.coverage
|
|
21
|
+
.coverage.*
|
|
22
|
+
coverage.xml
|
|
23
|
+
htmlcov/
|
|
24
|
+
|
|
25
|
+
# Type checking
|
|
26
|
+
.mypy_cache/
|
|
27
|
+
.dmypy.json
|
|
28
|
+
|
|
29
|
+
# Ruff
|
|
30
|
+
.ruff_cache/
|
|
31
|
+
|
|
32
|
+
# Build / packaging
|
|
33
|
+
dist/
|
|
34
|
+
*.whl
|
|
35
|
+
*.tar.gz
|
|
36
|
+
|
|
37
|
+
# IDEs
|
|
38
|
+
.vscode/
|
|
39
|
+
.idea/
|
|
40
|
+
*.swp
|
|
41
|
+
*.swo
|
|
42
|
+
|
|
43
|
+
# OS
|
|
44
|
+
.DS_Store
|
|
45
|
+
desktop.ini
|
|
46
|
+
Thumbs.db
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
rev: v0.4.4
|
|
4
|
+
hooks:
|
|
5
|
+
- id: ruff
|
|
6
|
+
args: [--fix]
|
|
7
|
+
- id: ruff-format
|
|
8
|
+
|
|
9
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
10
|
+
rev: v4.6.0
|
|
11
|
+
hooks:
|
|
12
|
+
- id: trailing-whitespace
|
|
13
|
+
- id: end-of-file-fixer
|
|
14
|
+
- id: check-yaml
|
|
15
|
+
- id: check-toml
|
|
16
|
+
- id: check-added-large-files
|
|
17
|
+
args: [--maxkb=500]
|
|
18
|
+
- id: debug-statements
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.1.0] - 2026-06-13
|
|
4
|
+
|
|
5
|
+
Initial release.
|
|
6
|
+
|
|
7
|
+
### Detection capabilities
|
|
8
|
+
- Span-level language detection: each span of a mixed-language input is labelled independently, with character offsets into the original string
|
|
9
|
+
- 75 languages via [lingua](https://github.com/pemistahl/lingua-py), plus dedicated lexicons for Roman Urdu (`ur-Latn`), Tagalog (`tl`), Swahili (`sw`), French (`fr`), Spanish (`es`), Portuguese (`pt`), Italian (`it`), German (`de`), and Turkish (`tr`)
|
|
10
|
+
- Character n-gram models for Roman Urdu, Tagalog, and Swahili, enabling detection of content words not in the lexicon. Models are pre-bundled and loaded at import time
|
|
11
|
+
- Non-Latin scripts identified directly from Unicode block ranges (Arabic, Devanagari, CJK, Cyrillic, Thai, Hebrew, Georgian, Korean, Japanese, Bengali, and more) without calling the language model
|
|
12
|
+
|
|
13
|
+
### Robustness features
|
|
14
|
+
- URLs, @mentions, #hashtags, emoji, and numbers are extracted before detection and reinserted in the output, so they never corrupt the language signal
|
|
15
|
+
- Mid-sentence capitalised tokens inconsistent with surrounding context are tagged `ne` (named entity / proper noun) rather than assigned a spurious language
|
|
16
|
+
- Text in near-identical languages (Spanish/Portuguese, Norwegian/Danish/Swedish, Indonesian/Malay, Croatian/Serbian) is returned as `und` with `ambiguous_candidates` populated, rather than as a confident wrong answer
|
|
17
|
+
- Context correction pass resolves undetermined tokens by majority vote over a +/- 3 token window, and absorbs single-token language islands into surrounding context
|
|
18
|
+
|
|
19
|
+
### API
|
|
20
|
+
- `analyze(text, *, languages, granularity, min_confidence, low_accuracy_mode, normalize, custom_lexicon)`
|
|
21
|
+
- `PolyStringResult`: `.spans`, `.languages`, `.dominant_language`, `.is_mixed`, `.confidence`, `.to_dict()`, `.to_dataframe()`, `.highlight()`, `.linguistic_spans()`
|
|
22
|
+
- `Span`: `text`, `language`, `token_type`, `confidence`, `start`, `end`, `is_foreign`, `ambiguous_candidates`
|
|
23
|
+
- `supported_languages()` returning the full list of detectable ISO codes
|
|
24
|
+
- `PolyStringError`, `UnsupportedLanguageError`, `InputTooShortError`
|
|
25
|
+
- Optional pandas integration: `pip install polystring[pandas]`
|
polystring-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Saad Khan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: polystring
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Span-level language detection for mixed-language text
|
|
5
|
+
Project-URL: Homepage, https://github.com/saadlohani/polystring
|
|
6
|
+
Project-URL: Documentation, https://github.com/saadlohani/polystring#readme
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/saadlohani/polystring/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/saadlohani/polystring/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Saad Khan <saadlohani@yahoo.com>
|
|
10
|
+
License: MIT License
|
|
11
|
+
|
|
12
|
+
Copyright (c) 2026 Saad Khan
|
|
13
|
+
|
|
14
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
15
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
16
|
+
in the Software without restriction, including without limitation the rights
|
|
17
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
18
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
19
|
+
furnished to do so, subject to the following conditions:
|
|
20
|
+
|
|
21
|
+
The above copyright notice and this permission notice shall be included in all
|
|
22
|
+
copies or substantial portions of the Software.
|
|
23
|
+
|
|
24
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
25
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
26
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
27
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
28
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
29
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
30
|
+
SOFTWARE.
|
|
31
|
+
License-File: LICENSE
|
|
32
|
+
Keywords: code-switching,language-detection,mixed-language,multilingual,nlp
|
|
33
|
+
Classifier: Development Status :: 3 - Alpha
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: Intended Audience :: Science/Research
|
|
36
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
|
+
Classifier: Operating System :: OS Independent
|
|
38
|
+
Classifier: Programming Language :: Python :: 3
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
42
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
43
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
44
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
45
|
+
Classifier: Typing :: Typed
|
|
46
|
+
Requires-Python: >=3.10
|
|
47
|
+
Requires-Dist: lingua-language-detector>=2.0
|
|
48
|
+
Requires-Dist: regex>=2023.0
|
|
49
|
+
Requires-Dist: typing-extensions>=4.0; python_version < '3.11'
|
|
50
|
+
Provides-Extra: dev
|
|
51
|
+
Requires-Dist: mypy>=1.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: pre-commit; extra == 'dev'
|
|
53
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
54
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
55
|
+
Requires-Dist: ruff>=0.1; extra == 'dev'
|
|
56
|
+
Provides-Extra: pandas
|
|
57
|
+
Requires-Dist: pandas>=1.5; extra == 'pandas'
|
|
58
|
+
Description-Content-Type: text/markdown
|
|
59
|
+
|
|
60
|
+
# polystring
|
|
61
|
+
|
|
62
|
+
**Span-level language detection for mixed-language text.**
|
|
63
|
+
|
|
64
|
+
Most language detection libraries return a single label for the whole string. polystring returns a labelled span for _every part_ of the sentence, with character offsets, confidence scores, and special-token extraction baked in.
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
from polystring import analyze
|
|
68
|
+
|
|
69
|
+
result = analyze("je suis tellement tired, this week has been rough")
|
|
70
|
+
|
|
71
|
+
for span in result.spans:
|
|
72
|
+
print(f"[{span.language}] {span.text!r}")
|
|
73
|
+
|
|
74
|
+
# [fr] 'je suis tellement tired'
|
|
75
|
+
# [en] 'this week has been rough'
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Why span-level detection matters
|
|
79
|
+
|
|
80
|
+
| Tool | What it returns for `"hola I love this city, en serio"` |
|
|
81
|
+
| -------------- | ------------------------------------------------------------- |
|
|
82
|
+
| `langdetect` | `"es"` (labels the whole string) |
|
|
83
|
+
| `lingua` | `"es"` (labels the whole string) |
|
|
84
|
+
| `langid` | `"es"` (labels the whole string) |
|
|
85
|
+
| **polystring** | `[es] "hola"` · `[en] "I love this city"` · `[es] "en serio"` |
|
|
86
|
+
|
|
87
|
+
Code-switching (mixing languages within a single sentence) is normal on social media, in diaspora communities, in customer support chats, and in any multilingual context. A single label for the whole input misses the structure entirely. polystring is built specifically for this problem.
|
|
88
|
+
|
|
89
|
+
## Installation
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pip install polystring
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Optional extras:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
pip install polystring[pandas] # enables result.to_dataframe()
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## Examples
|
|
104
|
+
|
|
105
|
+
### Spanish / English (Spanglish)
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
result = analyze("no puedo creer how good this restaurant is, en serio")
|
|
109
|
+
|
|
110
|
+
for span in result.spans:
|
|
111
|
+
print(f"[{span.language}] {span.text!r}")
|
|
112
|
+
|
|
113
|
+
# [es] 'no puedo creer'
|
|
114
|
+
# [en] 'how good this restaurant is'
|
|
115
|
+
# [es] 'en serio'
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### French / English
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
result = analyze("je suis tellement tired lately, I need des vacances")
|
|
122
|
+
|
|
123
|
+
for span in result.spans:
|
|
124
|
+
print(f"[{span.language}] {span.text!r}")
|
|
125
|
+
|
|
126
|
+
# [fr] 'je suis tellement tired lately'
|
|
127
|
+
# [en] 'I need'
|
|
128
|
+
# [fr] 'des vacances'
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Mixed with non-Latin scripts
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
result = analyze("this is great هذا رائع جداً I am very impressed")
|
|
135
|
+
|
|
136
|
+
for span in result.spans:
|
|
137
|
+
print(f"[{span.language}] {span.text!r}")
|
|
138
|
+
|
|
139
|
+
# [en] 'this is great'
|
|
140
|
+
# [ar] 'هذا رائع جداً'
|
|
141
|
+
# [en] 'I am very impressed'
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Working with results
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
result = analyze("je suis tellement tired, this week has been rough")
|
|
148
|
+
|
|
149
|
+
result.dominant_language # 'fr'
|
|
150
|
+
result.is_mixed # True
|
|
151
|
+
result.languages # {'fr', 'en'}
|
|
152
|
+
result.confidence # 0.87 (mean confidence across linguistic spans)
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Serialise to dict
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
result.to_dict()
|
|
159
|
+
# {
|
|
160
|
+
# 'text': 'je suis tellement tired, this week has been rough',
|
|
161
|
+
# 'spans': [{'text': 'je suis tellement tired', 'language': 'fr', ...}, ...],
|
|
162
|
+
# 'dominant_language': 'fr',
|
|
163
|
+
# 'is_mixed': True,
|
|
164
|
+
# 'confidence': 0.87,
|
|
165
|
+
# ...
|
|
166
|
+
# }
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Serialise to DataFrame
|
|
170
|
+
|
|
171
|
+
```python
|
|
172
|
+
# pip install polystring[pandas]
|
|
173
|
+
df = result.to_dataframe()
|
|
174
|
+
# text language token_type confidence start end is_foreign
|
|
175
|
+
# 0 je suis tellement tired fr text 0.91 0 23 False
|
|
176
|
+
# 1 this week has been rough en text 0.84 25 49 True
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### ANSI-coloured terminal output
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
print(result.highlight())
|
|
183
|
+
# [fr]je suis tellement tired [en]this week has been rough
|
|
184
|
+
# (each language rendered in a distinct colour)
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Filter to linguistic spans only
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
result.linguistic_spans()
|
|
191
|
+
# Returns spans with token_type == "text" only (no URLs, emoji, mentions, etc.)
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Span fields
|
|
195
|
+
|
|
196
|
+
| Field | Type | Description |
|
|
197
|
+
| ---------------------- | ----------- | --------------------------------------------------------------------------------------------- |
|
|
198
|
+
| `text` | `str` | Text as it appears in the input |
|
|
199
|
+
| `language` | `str` | ISO 639-1 code. `"ur-Latn"` for Roman Urdu, `"und"` for undetermined, `"ne"` for proper nouns |
|
|
200
|
+
| `token_type` | `str` | `"text"`, `"url"`, `"mention"`, `"hashtag"`, `"emoji"`, `"num"`, or `"ne"` |
|
|
201
|
+
| `confidence` | `float` | 0.0 to 1.0. Non-text tokens are always 0.0 |
|
|
202
|
+
| `start` / `end` | `int` | Character offsets into the original string |
|
|
203
|
+
| `is_foreign` | `bool` | `True` if this span is not the dominant language |
|
|
204
|
+
| `ambiguous_candidates` | `list[str]` | Populated when `language == "und"` due to a near-identical pair (e.g. `["es", "pt"]`) |
|
|
205
|
+
|
|
206
|
+
## Language coverage
|
|
207
|
+
|
|
208
|
+
polystring detects **75 languages** via [lingua](https://github.com/pemistahl/lingua-py). Non-Latin scripts (Arabic, Devanagari, CJK, Cyrillic, Thai, Hebrew, Korean, and more) are identified directly from Unicode ranges, with no model call needed.
|
|
209
|
+
|
|
210
|
+
Nine languages have dedicated lexicons on top of the model, which significantly improves accuracy on short spans and code-switched text: Roman Urdu / Hinglish, French, Spanish, Portuguese, Italian, German, Turkish, Tagalog, and Swahili.
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
import polystring
|
|
214
|
+
print(polystring.supported_languages()) # full list of 75 ISO 639-1 codes
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Options
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
analyze(
|
|
221
|
+
text,
|
|
222
|
+
languages=["es", "en"], # restrict to known language set (faster, fewer false positives)
|
|
223
|
+
granularity="token", # "span" (default) or "token" to get per-word data
|
|
224
|
+
min_confidence=0.70, # tokens below this threshold become "und"
|
|
225
|
+
low_accuracy_mode=False, # lexicon + script detection only, no model (very fast)
|
|
226
|
+
normalize=True, # NFC normalisation
|
|
227
|
+
custom_lexicon={"sw": ["mambo", "vipi"]}, # inject domain-specific words
|
|
228
|
+
)
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### `granularity="token"` gives per-word data
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
result = analyze("bonjour how are you doing", granularity="token")
|
|
235
|
+
|
|
236
|
+
for tok in result.tokens:
|
|
237
|
+
print(f"[{tok.language}] {tok.text!r} ({tok.confidence:.2f})")
|
|
238
|
+
|
|
239
|
+
# [fr] 'bonjour' (0.92)
|
|
240
|
+
# [en] 'how' (0.83)
|
|
241
|
+
# [en] 'are' (0.81)
|
|
242
|
+
# [en] 'you' (0.85)
|
|
243
|
+
# [en] 'doing' (0.88)
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Contributing
|
|
247
|
+
|
|
248
|
+
```bash
|
|
249
|
+
git clone https://github.com/saadlohani/polystring
|
|
250
|
+
cd polystring
|
|
251
|
+
pip install -e ".[dev]"
|
|
252
|
+
pytest
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
## License
|
|
256
|
+
|
|
257
|
+
MIT
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
# polystring
|
|
2
|
+
|
|
3
|
+
**Span-level language detection for mixed-language text.**
|
|
4
|
+
|
|
5
|
+
Most language detection libraries return a single label for the whole string. polystring returns a labelled span for _every part_ of the sentence, with character offsets, confidence scores, and special-token extraction baked in.
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from polystring import analyze
|
|
9
|
+
|
|
10
|
+
result = analyze("je suis tellement tired, this week has been rough")
|
|
11
|
+
|
|
12
|
+
for span in result.spans:
|
|
13
|
+
print(f"[{span.language}] {span.text!r}")
|
|
14
|
+
|
|
15
|
+
# [fr] 'je suis tellement tired'
|
|
16
|
+
# [en] 'this week has been rough'
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Why span-level detection matters
|
|
20
|
+
|
|
21
|
+
| Tool | What it returns for `"hola I love this city, en serio"` |
|
|
22
|
+
| -------------- | ------------------------------------------------------------- |
|
|
23
|
+
| `langdetect` | `"es"` (labels the whole string) |
|
|
24
|
+
| `lingua` | `"es"` (labels the whole string) |
|
|
25
|
+
| `langid` | `"es"` (labels the whole string) |
|
|
26
|
+
| **polystring** | `[es] "hola"` · `[en] "I love this city"` · `[es] "en serio"` |
|
|
27
|
+
|
|
28
|
+
Code-switching (mixing languages within a single sentence) is normal on social media, in diaspora communities, in customer support chats, and in any multilingual context. A single label for the whole input misses the structure entirely. polystring is built specifically for this problem.
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install polystring
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Optional extras:
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install polystring[pandas] # enables result.to_dataframe()
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## Examples
|
|
45
|
+
|
|
46
|
+
### Spanish / English (Spanglish)
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
result = analyze("no puedo creer how good this restaurant is, en serio")
|
|
50
|
+
|
|
51
|
+
for span in result.spans:
|
|
52
|
+
print(f"[{span.language}] {span.text!r}")
|
|
53
|
+
|
|
54
|
+
# [es] 'no puedo creer'
|
|
55
|
+
# [en] 'how good this restaurant is'
|
|
56
|
+
# [es] 'en serio'
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### French / English
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
result = analyze("je suis tellement tired lately, I need des vacances")
|
|
63
|
+
|
|
64
|
+
for span in result.spans:
|
|
65
|
+
print(f"[{span.language}] {span.text!r}")
|
|
66
|
+
|
|
67
|
+
# [fr] 'je suis tellement tired lately'
|
|
68
|
+
# [en] 'I need'
|
|
69
|
+
# [fr] 'des vacances'
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Mixed with non-Latin scripts
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
result = analyze("this is great هذا رائع جداً I am very impressed")
|
|
76
|
+
|
|
77
|
+
for span in result.spans:
|
|
78
|
+
print(f"[{span.language}] {span.text!r}")
|
|
79
|
+
|
|
80
|
+
# [en] 'this is great'
|
|
81
|
+
# [ar] 'هذا رائع جداً'
|
|
82
|
+
# [en] 'I am very impressed'
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Working with results
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
result = analyze("je suis tellement tired, this week has been rough")
|
|
89
|
+
|
|
90
|
+
result.dominant_language # 'fr'
|
|
91
|
+
result.is_mixed # True
|
|
92
|
+
result.languages # {'fr', 'en'}
|
|
93
|
+
result.confidence # 0.87 (mean confidence across linguistic spans)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Serialise to dict
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
result.to_dict()
|
|
100
|
+
# {
|
|
101
|
+
# 'text': 'je suis tellement tired, this week has been rough',
|
|
102
|
+
# 'spans': [{'text': 'je suis tellement tired', 'language': 'fr', ...}, ...],
|
|
103
|
+
# 'dominant_language': 'fr',
|
|
104
|
+
# 'is_mixed': True,
|
|
105
|
+
# 'confidence': 0.87,
|
|
106
|
+
# ...
|
|
107
|
+
# }
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Serialise to DataFrame
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
# pip install polystring[pandas]
|
|
114
|
+
df = result.to_dataframe()
|
|
115
|
+
# text language token_type confidence start end is_foreign
|
|
116
|
+
# 0 je suis tellement tired fr text 0.91 0 23 False
|
|
117
|
+
# 1 this week has been rough en text 0.84 25 49 True
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### ANSI-coloured terminal output
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
print(result.highlight())
|
|
124
|
+
# [fr]je suis tellement tired [en]this week has been rough
|
|
125
|
+
# (each language rendered in a distinct colour)
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Filter to linguistic spans only
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
result.linguistic_spans()
|
|
132
|
+
# Returns spans with token_type == "text" only (no URLs, emoji, mentions, etc.)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Span fields
|
|
136
|
+
|
|
137
|
+
| Field | Type | Description |
|
|
138
|
+
| ---------------------- | ----------- | --------------------------------------------------------------------------------------------- |
|
|
139
|
+
| `text` | `str` | Text as it appears in the input |
|
|
140
|
+
| `language` | `str` | ISO 639-1 code. `"ur-Latn"` for Roman Urdu, `"und"` for undetermined, `"ne"` for proper nouns |
|
|
141
|
+
| `token_type` | `str` | `"text"`, `"url"`, `"mention"`, `"hashtag"`, `"emoji"`, `"num"`, or `"ne"` |
|
|
142
|
+
| `confidence` | `float` | 0.0 to 1.0. Non-text tokens are always 0.0 |
|
|
143
|
+
| `start` / `end` | `int` | Character offsets into the original string |
|
|
144
|
+
| `is_foreign` | `bool` | `True` if this span is not the dominant language |
|
|
145
|
+
| `ambiguous_candidates` | `list[str]` | Populated when `language == "und"` due to a near-identical pair (e.g. `["es", "pt"]`) |
|
|
146
|
+
|
|
147
|
+
## Language coverage
|
|
148
|
+
|
|
149
|
+
polystring detects **75 languages** via [lingua](https://github.com/pemistahl/lingua-py). Non-Latin scripts (Arabic, Devanagari, CJK, Cyrillic, Thai, Hebrew, Korean, and more) are identified directly from Unicode ranges, with no model call needed.
|
|
150
|
+
|
|
151
|
+
Nine languages have dedicated lexicons on top of the model, which significantly improves accuracy on short spans and code-switched text: Roman Urdu / Hinglish, French, Spanish, Portuguese, Italian, German, Turkish, Tagalog, and Swahili.
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
import polystring
|
|
155
|
+
print(polystring.supported_languages()) # full list of 75 ISO 639-1 codes
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Options
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
analyze(
|
|
162
|
+
text,
|
|
163
|
+
languages=["es", "en"], # restrict to known language set (faster, fewer false positives)
|
|
164
|
+
granularity="token", # "span" (default) or "token" to get per-word data
|
|
165
|
+
min_confidence=0.70, # tokens below this threshold become "und"
|
|
166
|
+
low_accuracy_mode=False, # lexicon + script detection only, no model (very fast)
|
|
167
|
+
normalize=True, # NFC normalisation
|
|
168
|
+
custom_lexicon={"sw": ["mambo", "vipi"]}, # inject domain-specific words
|
|
169
|
+
)
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### `granularity="token"` gives per-word data
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
result = analyze("bonjour how are you doing", granularity="token")
|
|
176
|
+
|
|
177
|
+
for tok in result.tokens:
|
|
178
|
+
print(f"[{tok.language}] {tok.text!r} ({tok.confidence:.2f})")
|
|
179
|
+
|
|
180
|
+
# [fr] 'bonjour' (0.92)
|
|
181
|
+
# [en] 'how' (0.83)
|
|
182
|
+
# [en] 'are' (0.81)
|
|
183
|
+
# [en] 'you' (0.85)
|
|
184
|
+
# [en] 'doing' (0.88)
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
## Contributing
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
git clone https://github.com/saadlohani/polystring
|
|
191
|
+
cd polystring
|
|
192
|
+
pip install -e ".[dev]"
|
|
193
|
+
pytest
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## License
|
|
197
|
+
|
|
198
|
+
MIT
|