ne-embed 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,66 @@
1
+ Metadata-Version: 2.4
2
+ Name: ne-embed
3
+ Version: 1.0.0
4
+ Summary: Multilingual text embeddings for Northeast Indian languages
5
+ Author-email: MWire Labs <connect@mwirelabs.com>
6
+ License: CC-BY-4.0
7
+ Project-URL: Homepage, https://mwirelabs.com
8
+ Project-URL: Repository, https://github.com/mwirelabs/ne-embed
9
+ Project-URL: HuggingFace, https://huggingface.co/MWirelabs/ne-embed
10
+ Keywords: embeddings,nlp,northeast-india,low-resource,multilingual,RAG
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
+ Requires-Python: >=3.8
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: sentence-transformers>=2.7.0
16
+ Requires-Dist: numpy>=1.21.0
17
+
18
+ # ne-embed
19
+
20
+ Multilingual text embeddings for Northeast Indian languages.
21
+
22
+ **10 languages · 768 dimensions · Built on LaBSE · CC-BY-4.0**
23
+
24
+ ## Install
25
+
26
+ ```bash
27
+ pip install ne-embed
28
+ ```
29
+
30
+ ## Usage
31
+
32
+ ```python
33
+ from ne_embed import NEEmbed
34
+
35
+ model = NEEmbed() # downloads from HuggingFace on first run
36
+
37
+ sentences = [
38
+ 'Where is the nearest hospital?',
39
+ 'Ngi la pynjot ia ki shnong baroh', # Khasi
40
+ 'Pilakchin an senganiko man na.', # Garo
41
+ ]
42
+
43
+ embeddings = model.encode(sentences)
44
+ print(embeddings.shape) # (3, 768)
45
+
46
+ model.languages() # print supported languages
47
+ ```
48
+
49
+ ## Supported Languages
50
+
51
+ | Code | Language | Tier |
52
+ |------|----------|------|
53
+ | asm | Assamese | Supported |
54
+ | brx | Bodo | Supported |
55
+ | grt | Garo | Supported |
56
+ | kha | Khasi | Supported |
57
+ | lus | Mizo | Supported |
58
+ | mni | Meitei | Supported |
59
+ | njz | Nyishi | Supported |
60
+ | trp | Kokborok | Limited |
61
+ | pbv | Pnar | Limited |
62
+ | nag | Nagamese | Limited |
63
+
64
+ ## Links
65
+ - [HuggingFace](https://huggingface.co/MWirelabs/ne-embed)
66
+ - [MWire Labs](https://mwirelabs.com)
@@ -0,0 +1,49 @@
1
+ # ne-embed
2
+
3
+ Multilingual text embeddings for Northeast Indian languages.
4
+
5
+ **10 languages · 768 dimensions · Built on LaBSE · CC-BY-4.0**
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install ne-embed
11
+ ```
12
+
13
+ ## Usage
14
+
15
+ ```python
16
+ from ne_embed import NEEmbed
17
+
18
+ model = NEEmbed() # downloads from HuggingFace on first run
19
+
20
+ sentences = [
21
+ 'Where is the nearest hospital?',
22
+ 'Ngi la pynjot ia ki shnong baroh', # Khasi
23
+ 'Pilakchin an senganiko man na.', # Garo
24
+ ]
25
+
26
+ embeddings = model.encode(sentences)
27
+ print(embeddings.shape) # (3, 768)
28
+
29
+ model.languages() # print supported languages
30
+ ```
31
+
32
+ ## Supported Languages
33
+
34
+ | Code | Language | Tier |
35
+ |------|----------|------|
36
+ | asm | Assamese | Supported |
37
+ | brx | Bodo | Supported |
38
+ | grt | Garo | Supported |
39
+ | kha | Khasi | Supported |
40
+ | lus | Mizo | Supported |
41
+ | mni | Meitei | Supported |
42
+ | njz | Nyishi | Supported |
43
+ | trp | Kokborok | Limited |
44
+ | pbv | Pnar | Limited |
45
+ | nag | Nagamese | Limited |
46
+
47
+ ## Links
48
+ - [HuggingFace](https://huggingface.co/MWirelabs/ne-embed)
49
+ - [MWire Labs](https://mwirelabs.com)
@@ -0,0 +1,4 @@
1
+ from .model import NEEmbed, SUPPORTED_LANGUAGES
2
+
3
# Package version string; the same value is declared as `version` in pyproject.toml.
__version__ = "1.0.0"

# Public names re-exported from the `.model` submodule.
__all__ = [
    "NEEmbed",
    "SUPPORTED_LANGUAGES",
]
@@ -0,0 +1,45 @@
1
+ from sentence_transformers import SentenceTransformer
2
+ from typing import List, Union
3
+ import numpy as np
4
+
5
# Model identifier handed to SentenceTransformer when NEEmbed is constructed.
MODEL_ID = "MWirelabs/ne-embed"

# Three-letter language code -> English language name, in the order the
# languages are printed by NEEmbed.languages().
SUPPORTED_LANGUAGES = {
    "asm": "Assamese",
    "brx": "Bodo",
    "grt": "Garo",
    "kha": "Khasi",
    "lus": "Mizo",
    "mni": "Meitei",
    "njz": "Nyishi",
    "trp": "Kokborok",
    "pbv": "Pnar",
    "nag": "Nagamese",
}
12
+
13
class NEEmbed:
    """Convenience wrapper around the ``MODEL_ID`` SentenceTransformer model.

    The wrapped model is available as ``self.model``.
    """

    def __init__(self, device: Union[str, None] = None):
        """Load NE-Embed from HuggingFace (downloads on first use).

        Args:
            device: Forwarded unchanged to ``SentenceTransformer``; ``None``
                leaves device selection to that library.
        """
        print(f"Loading {MODEL_ID} ...")
        self.model = SentenceTransformer(MODEL_ID, device=device)
        # Cap the sequence length the underlying model will process.
        self.model.max_seq_length = 128
        print("Ready.")

    def encode(
        self,
        sentences: Union[str, List[str]],
        batch_size: int = 64,
        normalize: bool = True,
    ) -> np.ndarray:
        """Encode sentences into 768-dim embeddings.

        Args:
            sentences: One sentence or a list of sentences. A single string is
                wrapped in a one-element list before encoding, so it is handled
                as a batch of one.
            batch_size: Forwarded to the underlying model's ``encode``.
            normalize: Forwarded as ``normalize_embeddings``.

        Returns:
            The result of the underlying model's ``encode`` call for the list
            of sentences.
        """
        if isinstance(sentences, str):
            sentences = [sentences]
        return self.model.encode(
            sentences,
            batch_size=batch_size,
            normalize_embeddings=normalize,
            show_progress_bar=False,
        )

    def similarity(self, a: np.ndarray, b: np.ndarray) -> Union[float, list]:
        """Cosine similarity between two embedding arrays.

        Both operands are L2-normalised along their last axis before the dot
        product, so the result is a true cosine similarity even for embeddings
        produced with ``normalize=False``. For already-normalised inputs the
        values are unchanged up to floating-point rounding.

        Args:
            a: Embedding vector ``(dim,)`` or matrix ``(n, dim)``.
            b: Embedding vector ``(dim,)`` or matrix ``(m, dim)``.

        Returns:
            ``(a_unit @ b_unit.T).tolist()``: a nested list for matrix inputs,
            a flat list when exactly one operand is a vector, and a plain
            float when both are vectors. An all-zero vector has similarity
            ``0.0`` with everything.
        """
        a = np.asarray(a)
        b = np.asarray(b)
        a_norm = np.linalg.norm(a, axis=-1, keepdims=True)
        b_norm = np.linalg.norm(b, axis=-1, keepdims=True)
        # Divide by 1 instead of 0 for all-zero rows: they stay zero and give
        # a similarity of 0.0 rather than NaN plus a runtime warning.
        a_unit = a / np.where(a_norm == 0, 1.0, a_norm)
        b_unit = b / np.where(b_norm == 0, 1.0, b_norm)
        return (a_unit @ b_unit.T).tolist()

    @staticmethod
    def languages() -> None:
        """Print supported languages, one ``code name`` pair per line."""
        for code, name in SUPPORTED_LANGUAGES.items():
            print(f" {code} {name}")
@@ -0,0 +1,66 @@
1
+ Metadata-Version: 2.4
2
+ Name: ne-embed
3
+ Version: 1.0.0
4
+ Summary: Multilingual text embeddings for Northeast Indian languages
5
+ Author-email: MWire Labs <connect@mwirelabs.com>
6
+ License: CC-BY-4.0
7
+ Project-URL: Homepage, https://mwirelabs.com
8
+ Project-URL: Repository, https://github.com/mwirelabs/ne-embed
9
+ Project-URL: HuggingFace, https://huggingface.co/MWirelabs/ne-embed
10
+ Keywords: embeddings,nlp,northeast-india,low-resource,multilingual,RAG
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
+ Requires-Python: >=3.8
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: sentence-transformers>=2.7.0
16
+ Requires-Dist: numpy>=1.21.0
17
+
18
+ # ne-embed
19
+
20
+ Multilingual text embeddings for Northeast Indian languages.
21
+
22
+ **10 languages · 768 dimensions · Built on LaBSE · CC-BY-4.0**
23
+
24
+ ## Install
25
+
26
+ ```bash
27
+ pip install ne-embed
28
+ ```
29
+
30
+ ## Usage
31
+
32
+ ```python
33
+ from ne_embed import NEEmbed
34
+
35
+ model = NEEmbed() # downloads from HuggingFace on first run
36
+
37
+ sentences = [
38
+ 'Where is the nearest hospital?',
39
+ 'Ngi la pynjot ia ki shnong baroh', # Khasi
40
+ 'Pilakchin an senganiko man na.', # Garo
41
+ ]
42
+
43
+ embeddings = model.encode(sentences)
44
+ print(embeddings.shape) # (3, 768)
45
+
46
+ model.languages() # print supported languages
47
+ ```
48
+
49
+ ## Supported Languages
50
+
51
+ | Code | Language | Tier |
52
+ |------|----------|------|
53
+ | asm | Assamese | Supported |
54
+ | brx | Bodo | Supported |
55
+ | grt | Garo | Supported |
56
+ | kha | Khasi | Supported |
57
+ | lus | Mizo | Supported |
58
+ | mni | Meitei | Supported |
59
+ | njz | Nyishi | Supported |
60
+ | trp | Kokborok | Limited |
61
+ | pbv | Pnar | Limited |
62
+ | nag | Nagamese | Limited |
63
+
64
+ ## Links
65
+ - [HuggingFace](https://huggingface.co/MWirelabs/ne-embed)
66
+ - [MWire Labs](https://mwirelabs.com)
@@ -0,0 +1,9 @@
1
+ README.md
2
+ pyproject.toml
3
+ ne_embed/__init__.py
4
+ ne_embed/model.py
5
+ ne_embed.egg-info/PKG-INFO
6
+ ne_embed.egg-info/SOURCES.txt
7
+ ne_embed.egg-info/dependency_links.txt
8
+ ne_embed.egg-info/requires.txt
9
+ ne_embed.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ sentence-transformers>=2.7.0
2
+ numpy>=1.21.0
@@ -0,0 +1 @@
1
+ ne_embed
@@ -0,0 +1,26 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "ne-embed"
7
+ version = "1.0.0"
8
+ description = "Multilingual text embeddings for Northeast Indian languages"
9
+ readme = "README.md"
10
+ license = { text = "CC-BY-4.0" }
11
+ authors = [{ name = "MWire Labs", email = "connect@mwirelabs.com" }]
12
+ keywords = ["embeddings", "nlp", "northeast-india", "low-resource", "multilingual", "RAG"]
13
+ classifiers = [
14
+ "Programming Language :: Python :: 3",
15
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
16
+ ]
17
+ requires-python = ">=3.8"
18
+ dependencies = [
19
+ "sentence-transformers>=2.7.0",
20
+ "numpy>=1.21.0",
21
+ ]
22
+
23
+ [project.urls]
24
+ Homepage = "https://mwirelabs.com"
25
+ Repository = "https://github.com/mwirelabs/ne-embed"
26
+ HuggingFace = "https://huggingface.co/MWirelabs/ne-embed"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+