ne-embed 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ne_embed/__init__.py +4 -0
- ne_embed/model.py +45 -0
- ne_embed-1.0.0.dist-info/METADATA +66 -0
- ne_embed-1.0.0.dist-info/RECORD +6 -0
- ne_embed-1.0.0.dist-info/WHEEL +5 -0
- ne_embed-1.0.0.dist-info/top_level.txt +1 -0
ne_embed/__init__.py
ADDED
ne_embed/model.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
from sentence_transformers import SentenceTransformer
|
|
2
|
+
from typing import List, Union
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
# Model identifier handed to SentenceTransformer when NEEmbed is constructed.
MODEL_ID = "MWirelabs/ne-embed"

# Three-letter language code -> language name, printed by NEEmbed.languages().
# NOTE(review): the codes look like ISO 639-3 identifiers -- confirm.
# Insertion order is the order in which languages() prints the entries.
SUPPORTED_LANGUAGES = {
    "asm": "Assamese",
    "brx": "Bodo",
    "grt": "Garo",
    "kha": "Khasi",
    "lus": "Mizo",
    "mni": "Meitei",
    "njz": "Nyishi",
    "trp": "Kokborok",
    "pbv": "Pnar",
    "nag": "Nagamese",
}
|
|
12
|
+
|
|
13
|
+
class NEEmbed:
    """Convenience wrapper around the NE-Embed SentenceTransformer model.

    The loaded model is kept on ``self.model``; the constructor sets its
    ``max_seq_length`` to 128.
    """

    def __init__(self, device: Union[str, None] = None):
        """Load NE-Embed from HuggingFace (downloads on first use).

        Prints a short progress message before and after loading.

        Args:
            device: Device string passed through unchanged to
                ``SentenceTransformer``. ``None`` is forwarded as-is.
        """
        print(f"Loading {MODEL_ID} ...")
        self.model = SentenceTransformer(MODEL_ID, device=device)
        self.model.max_seq_length = 128
        print("Ready.")

    def encode(
        self,
        sentences: Union[str, List[str]],
        batch_size: int = 64,
        normalize: bool = True,
    ) -> np.ndarray:
        """Encode sentences into 768-dim embeddings.

        Args:
            sentences: One sentence or a list of sentences. A single string
                is wrapped in a one-element list, so the result is always
                the encoding of a batch.
            batch_size: Forwarded to the model's ``encode`` as ``batch_size``.
            normalize: Forwarded to the model's ``encode`` as
                ``normalize_embeddings``.

        Returns:
            Whatever the underlying model's ``encode`` returns for the batch.
        """
        if isinstance(sentences, str):
            sentences = [sentences]
        return self.model.encode(
            sentences,
            batch_size=batch_size,
            normalize_embeddings=normalize,
            show_progress_bar=False,
        )

    def similarity(self, a: np.ndarray, b: np.ndarray) -> Union[float, list]:
        """Cosine similarity between two embedding arrays.

        Both inputs are scaled to unit length along their last axis before
        the product is taken, so the result is a true cosine similarity even
        for embeddings produced with ``normalize=False``. Zero vectors are
        left as zeros and therefore score 0 against everything.

        Args:
            a: Embedding vector or array of row vectors.
            b: Embedding vector or array of row vectors with the same last
                dimension as ``a``.

        Returns:
            The result of ``a @ b.T`` on the unit-scaled inputs, converted
            with ``tolist()``: a nested list for 2-D inputs, a flat list when
            one input is 1-D, and a plain float when both are 1-D.
        """
        left = self._unit_rows(np.asarray(a))
        right = self._unit_rows(np.asarray(b))
        return (left @ right.T).tolist()

    @staticmethod
    def _unit_rows(vectors: np.ndarray) -> np.ndarray:
        """Scale each vector along the last axis to unit Euclidean length."""
        norms = np.linalg.norm(vectors, axis=-1, keepdims=True)
        # Substitute 1 for zero norms so zero vectors divide cleanly
        # instead of producing NaN.
        return vectors / np.where(norms == 0, 1.0, norms)

    @staticmethod
    def languages() -> None:
        """Print supported languages."""
        for code, name in SUPPORTED_LANGUAGES.items():
            print(f" {code} {name}")
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ne-embed
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Multilingual text embeddings for Northeast Indian languages
|
|
5
|
+
Author-email: MWire Labs <connect@mwirelabs.com>
|
|
6
|
+
License: CC-BY-4.0
|
|
7
|
+
Project-URL: Homepage, https://mwirelabs.com
|
|
8
|
+
Project-URL: Repository, https://github.com/mwirelabs/ne-embed
|
|
9
|
+
Project-URL: HuggingFace, https://huggingface.co/MWirelabs/ne-embed
|
|
10
|
+
Keywords: embeddings,nlp,northeast-india,low-resource,multilingual,RAG
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.8
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Requires-Dist: sentence-transformers>=2.7.0
|
|
16
|
+
Requires-Dist: numpy>=1.21.0
|
|
17
|
+
|
|
18
|
+
# ne-embed
|
|
19
|
+
|
|
20
|
+
Multilingual text embeddings for Northeast Indian languages.
|
|
21
|
+
|
|
22
|
+
**10 languages · 768 dimensions · Built on LaBSE · CC-BY-4.0**
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install ne-embed
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Usage
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from ne_embed import NEEmbed
|
|
34
|
+
|
|
35
|
+
model = NEEmbed() # downloads from HuggingFace on first run
|
|
36
|
+
|
|
37
|
+
sentences = [
|
|
38
|
+
'Where is the nearest hospital?',
|
|
39
|
+
'Ngi la pynjot ia ki shnong baroh', # Khasi
|
|
40
|
+
'Pilakchin an senganiko man na.', # Garo
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
embeddings = model.encode(sentences)
|
|
44
|
+
print(embeddings.shape) # (3, 768)
|
|
45
|
+
|
|
46
|
+
model.languages() # print supported languages
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Supported Languages
|
|
50
|
+
|
|
51
|
+
| Code | Language | Tier |
|
|
52
|
+
|------|----------|------|
|
|
53
|
+
| asm | Assamese | Supported |
|
|
54
|
+
| brx | Bodo | Supported |
|
|
55
|
+
| grt | Garo | Supported |
|
|
56
|
+
| kha | Khasi | Supported |
|
|
57
|
+
| lus | Mizo | Supported |
|
|
58
|
+
| mni | Meitei | Supported |
|
|
59
|
+
| njz | Nyishi | Supported |
|
|
60
|
+
| trp | Kokborok | Limited |
|
|
61
|
+
| pbv | Pnar | Limited |
|
|
62
|
+
| nag | Nagamese | Limited |
|
|
63
|
+
|
|
64
|
+
## Links
|
|
65
|
+
- [HuggingFace](https://huggingface.co/MWirelabs/ne-embed)
|
|
66
|
+
- [MWire Labs](https://mwirelabs.com)
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
ne_embed/__init__.py,sha256=XT4xM1ID1u3pdbTmPPbMTC1wnHRMd85I1Sm2TPFJa-I,116
|
|
2
|
+
ne_embed/model.py,sha256=HvjdmeWF-Am_8LDBQ0Co6hHiwOj0Etx3nBEdAYZ9Lgg,1456
|
|
3
|
+
ne_embed-1.0.0.dist-info/METADATA,sha256=QAm6-ensSywM5b6ziUGfYHJzDAVcv7FDPRoHxvdUUZs,1736
|
|
4
|
+
ne_embed-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
5
|
+
ne_embed-1.0.0.dist-info/top_level.txt,sha256=np7ZXInB2ZnTOrscUBVPQGRbTkH8LO2bUnZRsomiyoc,9
|
|
6
|
+
ne_embed-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ne_embed
|