ne-lid 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ne_lid-1.0.0/PKG-INFO +68 -0
- ne_lid-1.0.0/README.md +52 -0
- ne_lid-1.0.0/ne_lid/__init__.py +4 -0
- ne_lid-1.0.0/ne_lid/model.py +41 -0
- ne_lid-1.0.0/ne_lid.egg-info/PKG-INFO +68 -0
- ne_lid-1.0.0/ne_lid.egg-info/SOURCES.txt +9 -0
- ne_lid-1.0.0/ne_lid.egg-info/dependency_links.txt +1 -0
- ne_lid-1.0.0/ne_lid.egg-info/requires.txt +2 -0
- ne_lid-1.0.0/ne_lid.egg-info/top_level.txt +1 -0
- ne_lid-1.0.0/pyproject.toml +25 -0
- ne_lid-1.0.0/setup.cfg +4 -0
ne_lid-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ne-lid
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Language identification for Northeast Indian languages
|
|
5
|
+
Author-email: MWire Labs <connect@mwirelabs.com>
|
|
6
|
+
Project-URL: Homepage, https://mwirelabs.com
|
|
7
|
+
Project-URL: HuggingFace, https://huggingface.co/MWirelabs/ne-lid
|
|
8
|
+
Keywords: language-identification,nlp,northeast-india,low-resource,fasttext
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
11
|
+
Classifier: License :: Free To Use But Restricted
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: fasttext-wheel>=0.9.2
|
|
15
|
+
Requires-Dist: huggingface-hub>=0.20.0
|
|
16
|
+
|
|
17
|
+
# ne-lid
|
|
18
|
+
|
|
19
|
+
Language identification for Northeast Indian languages.
|
|
20
|
+
|
|
21
|
+
**11 languages · 99.09% accuracy · fastText · CC-BY-4.0**
|
|
22
|
+
|
|
23
|
+
## Install
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install ne-lid
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Usage
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from ne_lid import NELID
|
|
33
|
+
|
|
34
|
+
model = NELID() # downloads from HuggingFace on first run
|
|
35
|
+
|
|
36
|
+
# Single prediction
|
|
37
|
+
result = model.predict('Ki paidbah shnong ki la ia shim bynta')
|
|
38
|
+
print(result) # {'lang': 'kha', 'score': 0.9999}
|
|
39
|
+
|
|
40
|
+
# Top-3 predictions
|
|
41
|
+
results = model.predict('Ka sngi ka lieh', k=3)
|
|
42
|
+
print(results)
|
|
43
|
+
|
|
44
|
+
# Print the supported language codes and names
|
|
45
|
+
model.languages()
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Supported Languages
|
|
49
|
+
|
|
50
|
+
| Code | Language | Accuracy |
|
|
51
|
+
|------|----------|----------|
|
|
52
|
+
| asm | Assamese | 100% |
|
|
53
|
+
| brx | Bodo | 99% |
|
|
54
|
+
| eng | English | 98% |
|
|
55
|
+
| grt | Garo | 100% |
|
|
56
|
+
| hin | Hindi | 97% |
|
|
57
|
+
| kha | Khasi | 99% |
|
|
58
|
+
| trp | Kokborok | 100% |
|
|
59
|
+
| mni | Meitei | 100% |
|
|
60
|
+
| lus | Mizo | 99% |
|
|
61
|
+
| nag | Nagamese | 100% |
|
|
62
|
+
| njz | Nyishi | 99% |
|
|
63
|
+
|
|
64
|
+
Overall test accuracy: **99.09%**
|
|
65
|
+
|
|
66
|
+
## Links
|
|
67
|
+
- [HuggingFace](https://huggingface.co/MWirelabs/ne-lid)
|
|
68
|
+
- [MWire Labs](https://mwirelabs.com)
|
ne_lid-1.0.0/README.md
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# ne-lid
|
|
2
|
+
|
|
3
|
+
Language identification for Northeast Indian languages.
|
|
4
|
+
|
|
5
|
+
**11 languages · 99.09% accuracy · fastText · CC-BY-4.0**
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install ne-lid
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
from ne_lid import NELID
|
|
17
|
+
|
|
18
|
+
model = NELID() # downloads from HuggingFace on first run
|
|
19
|
+
|
|
20
|
+
# Single prediction
|
|
21
|
+
result = model.predict('Ki paidbah shnong ki la ia shim bynta')
|
|
22
|
+
print(result) # {'lang': 'kha', 'score': 0.9999}
|
|
23
|
+
|
|
24
|
+
# Top-3 predictions
|
|
25
|
+
results = model.predict('Ka sngi ka lieh', k=3)
|
|
26
|
+
print(results)
|
|
27
|
+
|
|
28
|
+
# Print the supported language codes and names
|
|
29
|
+
model.languages()
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Supported Languages
|
|
33
|
+
|
|
34
|
+
| Code | Language | Accuracy |
|
|
35
|
+
|------|----------|----------|
|
|
36
|
+
| asm | Assamese | 100% |
|
|
37
|
+
| brx | Bodo | 99% |
|
|
38
|
+
| eng | English | 98% |
|
|
39
|
+
| grt | Garo | 100% |
|
|
40
|
+
| hin | Hindi | 97% |
|
|
41
|
+
| kha | Khasi | 99% |
|
|
42
|
+
| trp | Kokborok | 100% |
|
|
43
|
+
| mni | Meitei | 100% |
|
|
44
|
+
| lus | Mizo | 99% |
|
|
45
|
+
| nag | Nagamese | 100% |
|
|
46
|
+
| njz | Nyishi | 99% |
|
|
47
|
+
|
|
48
|
+
Overall test accuracy: **99.09%**
|
|
49
|
+
|
|
50
|
+
## Links
|
|
51
|
+
- [HuggingFace](https://huggingface.co/MWirelabs/ne-lid)
|
|
52
|
+
- [MWire Labs](https://mwirelabs.com)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import fasttext
|
|
2
|
+
import os
|
|
3
|
+
from huggingface_hub import hf_hub_download
|
|
4
|
+
|
|
5
|
+
# Hugging Face Hub repository and file name of the trained fastText model.
MODEL_REPO = "MWirelabs/ne-lid"
MODEL_FILE = "ne_lid.bin"

# Supported language codes mapped to their display names.
LANGUAGES = {
    "asm": "Assamese",
    "brx": "Bodo",
    "eng": "English",
    "grt": "Garo",
    "hin": "Hindi",
    "kha": "Khasi",
    "trp": "Kokborok",
    "mni": "Meitei",
    "lus": "Mizo",
    "nag": "Nagamese (Naga)",
    "njz": "Nyishi",
}


class NELID:
    """Language identifier for Northeast Indian languages backed by fastText.

    Creating an instance fetches ``MODEL_FILE`` from ``MODEL_REPO`` with
    ``hf_hub_download`` and loads it with ``fasttext.load_model``. The loaded
    model is kept on ``self.model``.
    """

    def __init__(self):
        """Download (or reuse) the model file and load it."""
        print("Loading NE-LID...")
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        self.model = fasttext.load_model(model_path)
        print("Ready.")

    def predict(self, text: str, k: int = 1):
        """Identify the language of *text*.

        Args:
            text: The text to classify. Surrounding whitespace is stripped and
                embedded newlines are replaced by spaces before prediction.
            k: Number of candidate labels to request from the model. It is
                passed to fastText unchanged, so ``-1`` requests every label.

        Returns:
            For ``k == 1`` a single ``{"lang": code, "score": prob}`` dict.
            For any other ``k`` a list of such dicts, in the order the model
            returned them. Scores are rounded to four decimal places.
        """
        # The model is queried with a single line of text.
        text = text.strip().replace("\n", " ")
        labels, scores = self.model.predict(text, k=k)
        results = [
            {"lang": label.replace("__label__", ""), "score": round(float(score), 4)}
            for label, score in zip(labels, scores)
        ]
        # Only an explicit single-label request is unwrapped; ``k=-1`` (all
        # labels) must yield the full list rather than just the top entry.
        return results[0] if k == 1 else results

    @staticmethod
    def languages():
        """Print every supported language code with its display name."""
        for code, name in LANGUAGES.items():
            print(f"  {code}  {name}")
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ne-lid
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Language identification for Northeast Indian languages
|
|
5
|
+
Author-email: MWire Labs <connect@mwirelabs.com>
|
|
6
|
+
Project-URL: Homepage, https://mwirelabs.com
|
|
7
|
+
Project-URL: HuggingFace, https://huggingface.co/MWirelabs/ne-lid
|
|
8
|
+
Keywords: language-identification,nlp,northeast-india,low-resource,fasttext
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
11
|
+
Classifier: License :: Free To Use But Restricted
|
|
12
|
+
Requires-Python: >=3.8
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: fasttext-wheel>=0.9.2
|
|
15
|
+
Requires-Dist: huggingface-hub>=0.20.0
|
|
16
|
+
|
|
17
|
+
# ne-lid
|
|
18
|
+
|
|
19
|
+
Language identification for Northeast Indian languages.
|
|
20
|
+
|
|
21
|
+
**11 languages · 99.09% accuracy · fastText · CC-BY-4.0**
|
|
22
|
+
|
|
23
|
+
## Install
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install ne-lid
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Usage
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from ne_lid import NELID
|
|
33
|
+
|
|
34
|
+
model = NELID() # downloads from HuggingFace on first run
|
|
35
|
+
|
|
36
|
+
# Single prediction
|
|
37
|
+
result = model.predict('Ki paidbah shnong ki la ia shim bynta')
|
|
38
|
+
print(result) # {'lang': 'kha', 'score': 0.9999}
|
|
39
|
+
|
|
40
|
+
# Top-3 predictions
|
|
41
|
+
results = model.predict('Ka sngi ka lieh', k=3)
|
|
42
|
+
print(results)
|
|
43
|
+
|
|
44
|
+
# Print the supported language codes and names
|
|
45
|
+
model.languages()
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Supported Languages
|
|
49
|
+
|
|
50
|
+
| Code | Language | Accuracy |
|
|
51
|
+
|------|----------|----------|
|
|
52
|
+
| asm | Assamese | 100% |
|
|
53
|
+
| brx | Bodo | 99% |
|
|
54
|
+
| eng | English | 98% |
|
|
55
|
+
| grt | Garo | 100% |
|
|
56
|
+
| hin | Hindi | 97% |
|
|
57
|
+
| kha | Khasi | 99% |
|
|
58
|
+
| trp | Kokborok | 100% |
|
|
59
|
+
| mni | Meitei | 100% |
|
|
60
|
+
| lus | Mizo | 99% |
|
|
61
|
+
| nag | Nagamese | 100% |
|
|
62
|
+
| njz | Nyishi | 99% |
|
|
63
|
+
|
|
64
|
+
Overall test accuracy: **99.09%**
|
|
65
|
+
|
|
66
|
+
## Links
|
|
67
|
+
- [HuggingFace](https://huggingface.co/MWirelabs/ne-lid)
|
|
68
|
+
- [MWire Labs](https://mwirelabs.com)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ne_lid
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ne-lid"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Language identification for Northeast Indian languages"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [{ name = "MWire Labs", email = "connect@mwirelabs.com" }]
|
|
11
|
+
keywords = ["language-identification", "nlp", "northeast-india", "low-resource", "fasttext"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Programming Language :: Python :: 3",
|
|
14
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
15
|
+
"License :: Free To Use But Restricted",
|
|
16
|
+
]
|
|
17
|
+
requires-python = ">=3.8"
|
|
18
|
+
dependencies = [
|
|
19
|
+
"fasttext-wheel>=0.9.2",
|
|
20
|
+
"huggingface-hub>=0.20.0",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.urls]
|
|
24
|
+
Homepage = "https://mwirelabs.com"
|
|
25
|
+
HuggingFace = "https://huggingface.co/MWirelabs/ne-lid"
|
ne_lid-1.0.0/setup.cfg
ADDED