ne-embed 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ne_embed/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .model import NEEmbed, SUPPORTED_LANGUAGES
2
+
3
# Package release identifier.
__version__ = "1.0.0"

# Names re-exported by ``from ne_embed import *``.
__all__ = [
    "NEEmbed",
    "SUPPORTED_LANGUAGES",
]
ne_embed/model.py ADDED
@@ -0,0 +1,45 @@
1
+ from sentence_transformers import SentenceTransformer
2
+ from typing import List, Union
3
+ import numpy as np
4
+
5
# Model identifier handed to ``SentenceTransformer`` when ``NEEmbed`` is built.
MODEL_ID = "MWirelabs/ne-embed"

# Three-letter language code -> display name.
# NOTE(review): the codes look like ISO 639-3 identifiers -- confirm.
# Insertion order is significant: ``NEEmbed.languages()`` prints in this order.
SUPPORTED_LANGUAGES = {
    "asm": "Assamese",
    "brx": "Bodo",
    "grt": "Garo",
    "kha": "Khasi",
    "lus": "Mizo",
    "mni": "Meitei",
    "njz": "Nyishi",
    "trp": "Kokborok",
    "pbv": "Pnar",
    "nag": "Nagamese",
}
12
+
13
class NEEmbed:
    """Thin wrapper around the ``SentenceTransformer`` model named by ``MODEL_ID``.

    Attributes:
        model: The loaded ``SentenceTransformer`` instance used by ``encode``.
    """

    def __init__(self, device: Union[str, None] = None):
        """Load NE-Embed from HuggingFace (downloads on first use).

        Args:
            device: Forwarded unchanged to ``SentenceTransformer`` as its
                ``device`` argument. ``None`` (the default) leaves device
                selection to that library.
        """
        print(f"Loading {MODEL_ID} ...")
        self.model = SentenceTransformer(MODEL_ID, device=device)
        # Cap the model's sequence length at 128.
        # NOTE(review): presumably longer inputs are truncated by
        # sentence-transformers -- confirm against its documentation.
        self.model.max_seq_length = 128
        print("Ready.")

    def encode(
        self,
        sentences: Union[str, List[str]],
        batch_size: int = 64,
        normalize: bool = True,
    ) -> np.ndarray:
        """Encode sentences into 768-dim embeddings.

        Args:
            sentences: One sentence or a list of sentences. A bare string is
                wrapped in a one-element list before encoding, so the result
                is still batch-shaped rather than a single vector.
            batch_size: Forwarded to the underlying model's ``encode``.
            normalize: Forwarded as ``normalize_embeddings``.

        Returns:
            Whatever the underlying ``model.encode`` returns for a list input
            (annotated as ``np.ndarray``).
        """
        if isinstance(sentences, str):
            sentences = [sentences]
        return self.model.encode(
            sentences,
            batch_size=batch_size,
            normalize_embeddings=normalize,
            show_progress_bar=False,  # keep library use quiet
        )

    def similarity(self, a: np.ndarray, b: np.ndarray) -> Union[float, list]:
        """Cosine similarity between two embedding arrays.

        Both inputs are L2-normalized along their last axis before the
        product is taken, so the result is a true cosine similarity even for
        embeddings produced with ``normalize=False``. For inputs that are
        already unit-length the result matches the plain dot product up to
        floating-point rounding.

        Args:
            a: Embedding vector ``(d,)`` or matrix ``(m, d)``.
            b: Embedding vector ``(d,)`` or matrix ``(n, d)``.

        Returns:
            The similarities converted with ``ndarray.tolist()``: a nested
            list for two matrices, a flat list when one input is a vector,
            and a plain float for two vectors. An all-zero vector has
            similarity ``0.0`` with everything instead of producing NaN.
        """
        left = self._unit_rows(a)
        right = self._unit_rows(b)
        return (left @ right.T).tolist()

    @staticmethod
    def _unit_rows(vectors) -> np.ndarray:
        """Return *vectors* scaled to unit L2 norm along the last axis."""
        arr = np.asarray(vectors)
        if not np.issubdtype(arr.dtype, np.floating):
            # Integer/bool input: promote so the division below is exact.
            arr = arr.astype(np.float64)
        norms = np.linalg.norm(arr, axis=-1, keepdims=True)
        # Dividing a zero vector by 1 leaves it zero and avoids 0/0 -> NaN.
        safe_norms = np.where(norms == 0, 1.0, norms)
        return arr / safe_norms

    @staticmethod
    def languages() -> None:
        """Print supported languages, one ``code name`` pair per line."""
        for code, name in SUPPORTED_LANGUAGES.items():
            print(f" {code} {name}")
@@ -0,0 +1,66 @@
1
+ Metadata-Version: 2.4
2
+ Name: ne-embed
3
+ Version: 1.0.0
4
+ Summary: Multilingual text embeddings for Northeast Indian languages
5
+ Author-email: MWire Labs <connect@mwirelabs.com>
6
+ License: CC-BY-4.0
7
+ Project-URL: Homepage, https://mwirelabs.com
8
+ Project-URL: Repository, https://github.com/mwirelabs/ne-embed
9
+ Project-URL: HuggingFace, https://huggingface.co/MWirelabs/ne-embed
10
+ Keywords: embeddings,nlp,northeast-india,low-resource,multilingual,RAG
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
+ Requires-Python: >=3.8
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: sentence-transformers>=2.7.0
16
+ Requires-Dist: numpy>=1.21.0
17
+
18
+ # ne-embed
19
+
20
+ Multilingual text embeddings for Northeast Indian languages.
21
+
22
+ **10 languages · 768 dimensions · Built on LaBSE · CC-BY-4.0**
23
+
24
+ ## Install
25
+
26
+ ```bash
27
+ pip install ne-embed
28
+ ```
29
+
30
+ ## Usage
31
+
32
+ ```python
33
+ from ne_embed import NEEmbed
34
+
35
+ model = NEEmbed() # downloads from HuggingFace on first run
36
+
37
+ sentences = [
38
+ 'Where is the nearest hospital?',
39
+ 'Ngi la pynjot ia ki shnong baroh', # Khasi
40
+ 'Pilakchin an senganiko man na.', # Garo
41
+ ]
42
+
43
+ embeddings = model.encode(sentences)
44
+ print(embeddings.shape) # (3, 768)
45
+
46
+ model.languages() # print supported languages
47
+ ```
48
+
49
+ ## Supported Languages
50
+
51
+ | Code | Language | Tier |
52
+ |------|----------|------|
53
+ | asm | Assamese | Supported |
54
+ | brx | Bodo | Supported |
55
+ | grt | Garo | Supported |
56
+ | kha | Khasi | Supported |
57
+ | lus | Mizo | Supported |
58
+ | mni | Meitei | Supported |
59
+ | njz | Nyishi | Supported |
60
+ | trp | Kokborok | Limited |
61
+ | pbv | Pnar | Limited |
62
+ | nag | Nagamese | Limited |
63
+
64
+ ## Links
65
+ - [HuggingFace](https://huggingface.co/MWirelabs/ne-embed)
66
+ - [MWire Labs](https://mwirelabs.com)
@@ -0,0 +1,6 @@
1
+ ne_embed/__init__.py,sha256=XT4xM1ID1u3pdbTmPPbMTC1wnHRMd85I1Sm2TPFJa-I,116
2
+ ne_embed/model.py,sha256=HvjdmeWF-Am_8LDBQ0Co6hHiwOj0Etx3nBEdAYZ9Lgg,1456
3
+ ne_embed-1.0.0.dist-info/METADATA,sha256=QAm6-ensSywM5b6ziUGfYHJzDAVcv7FDPRoHxvdUUZs,1736
4
+ ne_embed-1.0.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
5
+ ne_embed-1.0.0.dist-info/top_level.txt,sha256=np7ZXInB2ZnTOrscUBVPQGRbTkH8LO2bUnZRsomiyoc,9
6
+ ne_embed-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ ne_embed