@fidel-tools/core 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +56 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.js +12 -0
- package/dist/index.js.map +1 -0
- package/dist/indexer.d.ts +18 -0
- package/dist/indexer.js +107 -0
- package/dist/indexer.js.map +1 -0
- package/dist/lexical_analyzer.d.ts +9 -0
- package/dist/lexical_analyzer.js +22 -0
- package/dist/lexical_analyzer.js.map +1 -0
- package/dist/normalizer.d.ts +11 -0
- package/dist/normalizer.js +41 -0
- package/dist/normalizer.js.map +1 -0
- package/dist/pipeline.d.ts +19 -0
- package/dist/pipeline.js +44 -0
- package/dist/pipeline.js.map +1 -0
- package/dist/sentence_tokenizer.d.ts +10 -0
- package/dist/sentence_tokenizer.js +22 -0
- package/dist/sentence_tokenizer.js.map +1 -0
- package/dist/stemmer.d.ts +12 -0
- package/dist/stemmer.js +86 -0
- package/dist/stemmer.js.map +1 -0
- package/dist/stopword_remover.d.ts +12 -0
- package/dist/stopword_remover.js +33 -0
- package/dist/stopword_remover.js.map +1 -0
- package/dist/term_weighter.d.ts +4 -0
- package/dist/term_weighter.js +71 -0
- package/dist/term_weighter.js.map +1 -0
- package/dist/transliterator.d.ts +20 -0
- package/dist/transliterator.js +88 -0
- package/dist/transliterator.js.map +1 -0
- package/dist/types.d.ts +54 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/package.json +44 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022 Liul Alemayehu
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# @fidel-tools/core
|
|
2
|
+
|
|
3
|
+
The core NLP pipeline and text pre-processing engine for Amharic and Ethiopic script processing.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Normalization**: Standardizes homophones, expands labialized strings, and collapses character gemination.
|
|
10
|
+
- **Tokenization**: Standard and sentence-level tokenizers with exception mapping (abbreviation expansion).
|
|
11
|
+
- **Stopwords**: Morphology-aware boundary filtering that removes stopwords safely without corrupting base stems.
|
|
12
|
+
- **Light Stemmer**: Prefix- and suffix-removal algorithms for root extraction.
|
|
13
|
+
- **Transliteration**: Bidirectional SERA and Felig ASCII transliterators.
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pnpm add @fidel-tools/core
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## API & Usage
|
|
26
|
+
|
|
27
|
+
### 1. Unified Pipeline
|
|
28
|
+
|
|
29
|
+
```typescript
|
|
30
|
+
import { Pipeline } from '@fidel-tools/core'
|
|
31
|
+
import amPack from '@fidel-tools/lang-am' // Or your custom pack
|
|
32
|
+
|
|
33
|
+
const nlp = new Pipeline(amPack)
|
|
34
|
+
|
|
35
|
+
const normalized = nlp.normalize("ሐኪም ኀይሉ")
|
|
36
|
+
const tokens = nlp.lexAnalyze("ት/ቤት እና መስሪያ ቤት")
|
|
37
|
+
const stemmed = nlp.stem("ልጆቻቸውን")
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### 2. Low-Level Component Exports
|
|
41
|
+
|
|
42
|
+
```typescript
|
|
43
|
+
import { normalize, sentenceTokenize, removeStopwords, stem } from '@fidel-tools/core'
|
|
44
|
+
import amPack from '@fidel-tools/lang-am'
|
|
45
|
+
|
|
46
|
+
// Individual functional components
|
|
47
|
+
const text = normalize("ሐኪም ኀይሉ", amPack)
|
|
48
|
+
const sentences = sentenceTokenize("ይህ ዓረፍተ ነገር ነው። ያኛው ደግሞ፡", amPack)
|
|
49
|
+
const cleaned = removeStopwords("እና በመሆኑም ትምህርት", amPack)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## License
|
|
55
|
+
|
|
56
|
+
[MIT License](./LICENSE)
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
export type { LanguagePack, LanguagePackMeta, StemmerConfig, TransliterationConfig } from './types.js';
|
|
2
|
+
export { Pipeline } from './pipeline.js';
|
|
3
|
+
export { normalize } from './normalizer.js';
|
|
4
|
+
export { sentenceTokenize } from './sentence_tokenizer.js';
|
|
5
|
+
export { stem } from './stemmer.js';
|
|
6
|
+
export { removeStopwords } from './stopword_remover.js';
|
|
7
|
+
export { lexAnalyze } from './lexical_analyzer.js';
|
|
8
|
+
export { felig_transliterate, sera_transliterate } from './transliterator.js';
|
|
9
|
+
export { indexDocuments, indexQuery, indexTerms } from './indexer.js';
|
|
10
|
+
export { weighTerms, weigh_terms } from './term_weighter.js';
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
// Pipeline (primary API)
|
|
2
|
+
export { Pipeline } from './pipeline.js';
|
|
3
|
+
// Individual functions (secondary API, for tree-shaking)
|
|
4
|
+
export { normalize } from './normalizer.js';
|
|
5
|
+
export { sentenceTokenize } from './sentence_tokenizer.js';
|
|
6
|
+
export { stem } from './stemmer.js';
|
|
7
|
+
export { removeStopwords } from './stopword_remover.js';
|
|
8
|
+
export { lexAnalyze } from './lexical_analyzer.js';
|
|
9
|
+
export { felig_transliterate, sera_transliterate } from './transliterator.js';
|
|
10
|
+
export { indexDocuments, indexQuery, indexTerms } from './indexer.js';
|
|
11
|
+
export { weighTerms, weigh_terms } from './term_weighter.js';
|
|
12
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAGA,yBAAyB;AACzB,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAA;AAExC,yDAAyD;AACzD,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAA;AAC3C,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAA;AAC1D,OAAO,EAAE,IAAI,EAAE,MAAM,cAAc,CAAA;AACnC,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAA;AACvD,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAA;AAClD,OAAO,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAA;AAC7E,OAAO,EAAE,cAAc,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,cAAc,CAAA;AACrE,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAA"}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { LanguagePack } from './types.js';
|
|
2
|
+
export interface DocIndexData { // Index structure returned by indexDocuments for a set of documents.
|
|
3
|
+
corpus_size: number; // Number of documents passed to the indexer.
|
|
4
|
+
corpus_word_count: Record<string, number>; // Document id -> number of space-separated pieces in its raw content.
|
|
5
|
+
words: Record<string, Array<Record<string, number>>>; // Stemmed term -> postings; each posting is a single-entry { [docId]: occurrences } map.
|
|
6
|
+
}
|
|
7
|
+
export interface QueryIndexData { // Index structure returned by indexQuery for a single query string.
|
|
8
|
+
corpus_size: number; // Always 1 for a query (set as a constant by indexQuery).
|
|
9
|
+
corpus_word_count: number; // Number of space-separated pieces in the raw query string.
|
|
10
|
+
words: Record<string, number>; // Stemmed term -> number of occurrences in the query.
|
|
11
|
+
}
|
|
12
|
+
/** Builds an index from documents that are already in memory (id + raw content); performs no file I/O. */
export declare function indexDocuments(docs: Array<{
|
|
13
|
+
id: string;
|
|
14
|
+
content: string;
|
|
15
|
+
}>, pack: LanguagePack): DocIndexData;
|
|
16
|
+
/** Indexes a single query string: counts how often each stemmed, stopword-free term occurs in it. */
export declare function indexQuery(query: string, pack: LanguagePack): QueryIndexData;
|
|
17
|
+
/**
 * Node.js wrapper around indexDocuments / indexQuery that uses `fs`.
 * For type "doc", `corpus` holds file paths that are read as UTF-8 and the index is written to
 * `<outputIndexFilePath>/docIndexFile.json`; otherwise the query index is written to
 * `<outputIndexFilePath>/queryIndexFile.json`. File read/write failures are logged, not thrown.
 */
export declare function indexTerms(corpus: string[], outputIndexFilePath: string, type: "doc" | "query", pack: LanguagePack): Promise<void>;
|
|
18
|
+
export default indexTerms;
|
package/dist/indexer.js
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import stem from "./stemmer.js";
|
|
2
|
+
import lexAnalyze from "./lexical_analyzer.js";
|
|
3
|
+
import rmvStopwrd from "./stopword_remover.js";
|
|
4
|
+
// Pure functions — no fs dependency
|
|
5
|
+
/**
 * Builds an inverted index from in-memory documents (no file-system access).
 *
 * Every document is lexically analyzed, stripped of stopwords and stemmed;
 * empty results and terms of length 1 are dropped before indexing.
 *
 * @param docs : documents to index, each with an `id` and its raw `content`
 * @param pack : language pack configuration
 * @returns : `corpus_size` (number of documents), `corpus_word_count`
 *            (doc id -> number of space-separated pieces of the raw content) and
 *            `words` (term -> list of single-entry `{ [docId]: occurrences }` postings)
 */
export function indexDocuments(docs, pack) {
    // Own-property test. The `in` operator also sees members inherited from
    // Object.prototype, so a term such as "constructor" or "toString" would look
    // "already indexed" and the lookup would hand back a function, not a postings list.
    const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
    const indexData = {
        corpus_size: docs.length,
        corpus_word_count: {},
        words: {}
    };
    for (const doc of docs) {
        // NOTE(review): this counts raw pieces between single spaces, so empty
        // content is recorded as 1; kept unchanged because the weighting code
        // that consumes it is not visible here.
        indexData.corpus_word_count[doc.id] = doc.content.split(" ").length;
        // preprocess: analyze -> remove stopwords -> stem -> drop empty / 1-char terms
        const terms = rmvStopwrd(lexAnalyze(doc.content, pack), pack)
            .split(" ")
            .map((word) => stem(word, pack))
            .filter((term) => term && term.length > 1);
        // index
        for (const term of terms) {
            if (!hasOwn(indexData.words, term)) {
                indexData.words[term] = [{ [doc.id]: 1 }];
                continue;
            }
            const postings = indexData.words[term];
            const posting = postings.find((entry) => hasOwn(entry, doc.id));
            if (posting) {
                posting[doc.id]++;
            }
            else {
                postings.push({ [doc.id]: 1 });
            }
        }
    }
    return indexData;
}
|
|
45
|
+
/**
 * Indexes a single query string (no file-system access).
 *
 * The query goes through the same preprocessing as documents: lexical analysis,
 * stopword removal and stemming; empty results and terms of length 1 are dropped.
 *
 * @param query : raw query text
 * @param pack : language pack configuration
 * @returns : `corpus_size` (always 1), `corpus_word_count` (number of
 *            space-separated pieces of the raw query) and `words` (term -> occurrences)
 */
export function indexQuery(query, pack) {
    const indexData = {
        corpus_size: 1,
        corpus_word_count: query.split(" ").length,
        words: {}
    };
    // preprocess
    const terms = rmvStopwrd(lexAnalyze(query, pack), pack)
        .split(" ")
        .map((word) => stem(word, pack))
        .filter((term) => term && term.length > 1);
    // index
    for (const term of terms) {
        // Own-property check instead of `in`: a term that names an inherited member
        // (e.g. "constructor") must start at 1, not become NaN via `function++`.
        const seen = Object.prototype.hasOwnProperty.call(indexData.words, term);
        indexData.words[term] = seen ? indexData.words[term] + 1 : 1;
    }
    return indexData;
}
|
|
70
|
+
// Backwards-compat Node.js wrapper — fs lives here only
|
|
71
|
+
/**
 * Node.js convenience wrapper that reads input from / writes the index to disk.
 *
 * - type "doc": every entry of `corpus` is a file path. Each file is read as UTF-8
 *   (a file that cannot be read is logged and indexed with empty content) and the
 *   result of indexDocuments is written to `<outputIndexFilePath>/docIndexFile.json`.
 * - any other type: `corpus` is treated as the query. An array is joined with
 *   single spaces, a plain string is used as is, and the result of indexQuery is
 *   written to `<outputIndexFilePath>/queryIndexFile.json`.
 *
 * File read/write failures are reported with console.log and not rethrown.
 *
 * @param corpus : file paths (type "doc") or the query text / query pieces
 * @param outputIndexFilePath : directory that receives the JSON index file
 * @param type : "doc" or "query"
 * @param pack : language pack configuration
 */
export async function indexTerms(corpus, outputIndexFilePath, type, pack) {
    const fs = await import("fs");
    if (type === "doc") {
        const docs = corpus.map((filePath) => {
            try {
                return { id: filePath, content: fs.readFileSync(filePath, 'utf8') };
            }
            catch (error) {
                console.log(`Error reading ${filePath} file from disk:`, error);
                return { id: filePath, content: "" };
            }
        });
        const result = indexDocuments(docs, pack);
        try {
            fs.writeFileSync(outputIndexFilePath + '/docIndexFile.json', JSON.stringify(result, null, 2));
            for (const doc of docs) {
                console.log(`Contents of ${doc.id} successfully added to index`);
            }
        }
        catch (error) {
            console.log("Index creation failed", error);
        }
        return;
    }
    try {
        // indexQuery works on a string (it calls `.split`); the declared corpus type
        // is string[], which previously always threw here and produced no index.
        const query = Array.isArray(corpus) ? corpus.join(" ") : corpus;
        const result = indexQuery(query, pack);
        fs.writeFileSync(outputIndexFilePath + '/queryIndexFile.json', JSON.stringify(result, null, 2));
        console.log(`Contents of Query successfully added to index`);
    }
    catch (error) {
        console.log("Index creation failed", error);
    }
}
|
|
106
|
+
export default indexTerms;
|
|
107
|
+
//# sourceMappingURL=indexer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"indexer.js","sourceRoot":"","sources":["../src/indexer.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,cAAc,CAAA;AAC/B,OAAO,UAAU,MAAM,uBAAuB,CAAA;AAC9C,OAAO,UAAU,MAAM,uBAAuB,CAAA;AAgB9C,oCAAoC;AACpC,MAAM,UAAU,cAAc,CAC5B,IAA4C,EAC5C,IAAkB;IAElB,MAAM,SAAS,GAAiB;QAC9B,WAAW,EAAE,IAAI,CAAC,MAAM;QACxB,iBAAiB,EAAE,EAAE;QACrB,KAAK,EAAE,EAAE;KACV,CAAA;IAED,IAAI,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE;QACnB,SAAS,CAAC,iBAAiB,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAA;QAEnE,aAAa;QACb,MAAM,cAAc,GAAG,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;QACjF,MAAM,YAAY,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;QACnE,MAAM,MAAM,GAAG,YAAY;aACxB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;aAChB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACZ,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,CAAA;QACrB,CAAC,CAAC,CAAA;QAEJ,QAAQ;QACR,IAAI,QAAQ,GAAG,CAAC,CAAA;QAChB,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;YACtB,IAAI,IAAI,IAAI,SAAS,CAAC,KAAK,EAAE,CAAC;gBAC5B,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;oBACxC,IAAI,GAAG,CAAC,EAAE,IAAI,OAAO,EAAE,CAAC;wBACtB,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAA;wBACjB,QAAQ,GAAG,CAAC,CAAA;oBACd,CAAC;gBACH,CAAC,CAAC,CAAA;gBACF,IAAI,QAAQ,KAAK,CAAC,EAAE,CAAC;oBACnB,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAA;gBAC7C,CAAC;qBAAM,CAAC;oBACN,QAAQ,GAAG,CAAC,CAAA;gBACd,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAA;YAC3C,CAAC;QACH,CAAC,CAAC,CAAA;IACJ,CAAC,CAAC,CAAA;IAEF,OAAO,SAAS,CAAA;AAClB,CAAC;AAED,MAAM,UAAU,UAAU,CACxB,KAAa,EACb,IAAkB;IAElB,MAAM,SAAS,GAAmB;QAChC,WAAW,EAAE,CAAC;QACd,iBAAiB,EAAE,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM;QAC1C,KAAK,EAAE,EAAE;KACV,CAAA;IAED,aAAa;IACb,MAAM,cAAc,GAAG,UAAU,CAAC,UAAU,CAAC,KAAK,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;IAC3E,MAAM,YAAY,GAAG,cA
Ac,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IACnE,MAAM,MAAM,GAAG,YAAY;SACxB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;SAChB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;QACZ,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,CAAA;IACrB,CAAC,CAAC,CAAA;IAEJ,QAAQ;IACR,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;QACtB,IAAI,IAAI,IAAI,SAAS,CAAC,KAAK,EAAE,CAAC;YAC5B,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAA;QACzB,CAAC;aAAM,CAAC;YACN,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QAC3B,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,OAAO,SAAS,CAAA;AAClB,CAAC;AAED,wDAAwD;AACxD,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,MAAgB,EAChB,mBAA2B,EAC3B,IAAqB,EACrB,IAAkB;IAElB,MAAM,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAA;IAC7B,IAAI,IAAI,KAAK,KAAK,EAAE,CAAC;QACnB,MAAM,IAAI,GAAG,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE;YACjC,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAA;gBACjD,OAAO,EAAE,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAA;YAClC,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,GAAG,CAAC,iBAAiB,QAAQ,kBAAkB,EAAE,KAAK,CAAC,CAAA;gBAC/D,OAAO,EAAE,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,EAAE,EAAE,CAAA;YACtC,CAAC;QACH,CAAC,CAAC,CAAA;QACF,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,CAAA;QACzC,IAAI,CAAC;YACH,EAAE,CAAC,aAAa,CAAC,mBAAmB,GAAG,oBAAoB,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAA;YAC7F,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;gBACjB,OAAO,CAAC,GAAG,CAAC,eAAe,GAAG,CAAC,EAAE,8BAA8B,CAAC,CAAA;YAClE,CAAC,CAAC,CAAA;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,GAAG,CAAC,uBAAuB,EAAE,KAAK,CAAC,CAAA;QAC7C,CAAC;IACH,CAAC;SAAM,CAAC;QACN,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,UAAU,CAAC,MAA2B,EAAE,IAAI,CAAC,CAAA;YAC5D,EAAE,CAAC,aAAa,CAAC,mBAAmB,GAAG,sBAAsB,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAA;YAC/F,OAAO,CAAC,GAAG,CAAC,+CAA+C,CAAC,CAAA;QAC9D,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,GAAG,CAAC,uBAAuB,EAAE,KAAK,CAAC,CAAA;QAC7C,CAAC;IACH,CAAC;AACH,CAAC;AAED,eAAe,UAAU,CAAA"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { LanguagePack } from './types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Separates words, expands abbreviations, removes numbers, breaks up hyphenated words, and removes punctuation
|
|
4
|
+
* @param corpus : Amharic text
|
|
5
|
+
* @param pack : language pack configuration
|
|
6
|
+
* @returns : Lexically analyzed Amharic text
|
|
7
|
+
*/
|
|
8
|
+
export declare function lexAnalyze(corpus: string, pack: LanguagePack): string;
|
|
9
|
+
export default lexAnalyze;
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Separates words, expands abbreviations, removes numbers, breaks up hyphenated words, and removes punctuation
|
|
3
|
+
* @param corpus : Amharic text
|
|
4
|
+
* @param pack : language pack configuration
|
|
5
|
+
* @returns : Lexically analyzed Amharic text
|
|
6
|
+
*/
|
|
7
|
+
export function lexAnalyze(corpus, pack) {
|
|
8
|
+
// Expand exceptions (abbreviations)
|
|
9
|
+
if (pack.tokenization && pack.tokenization.exceptions) {
|
|
10
|
+
for (const key in pack.tokenization.exceptions) {
|
|
11
|
+
const expansion = pack.tokenization.exceptions[key].join(" ");
|
|
12
|
+
corpus = corpus.replaceAll(key, expansion);
|
|
13
|
+
}
|
|
14
|
+
}
|
|
15
|
+
corpus = corpus
|
|
16
|
+
.replace(/[.\?"',/#!$%^&*;:፤።{}=\-_`~()]/g, " ")
|
|
17
|
+
.replace(/[.፩፪፫፬፭፮፯፰፱፲፳፴፵፶፷፸፹፺፻0123456789]/g, " ")
|
|
18
|
+
.replace(/\s{2,}/g, " ");
|
|
19
|
+
return corpus;
|
|
20
|
+
}
|
|
21
|
+
export default lexAnalyze;
|
|
22
|
+
//# sourceMappingURL=lexical_analyzer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"lexical_analyzer.js","sourceRoot":"","sources":["../src/lexical_analyzer.ts"],"names":[],"mappings":"AAGA;;;;;GAKG;AACH,MAAM,UAAU,UAAU,CAAC,MAAc,EAAE,IAAkB;IAC3D,oCAAoC;IACpC,IAAI,IAAI,CAAC,YAAY,IAAI,IAAI,CAAC,YAAY,CAAC,UAAU,EAAE,CAAC;QACtD,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,YAAY,CAAC,UAAU,EAAE,CAAC;YAC/C,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;YAC7D,MAAM,GAAG,MAAM,CAAC,UAAU,CAAC,GAAG,EAAE,SAAS,CAAC,CAAA;QAC5C,CAAC;IACH,CAAC;IAED,MAAM,GAAG,MAAM;SACZ,OAAO,CAAC,iCAAiC,EAAE,GAAG,CAAC;SAC/C,OAAO,CAAC,mCAAmC,EAAE,GAAG,CAAC;SACjD,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAA;IAE1B,OAAO,MAAM,CAAA;AACf,CAAC;AAED,eAAe,UAAU,CAAA"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { LanguagePack } from './types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Normalizes Amharic text by applying character mapping,
|
|
4
|
+
* labialized sequence normalization, and gemination collapse.
|
|
5
|
+
*
|
|
6
|
+
* @param text The input string to normalize.
|
|
7
|
+
* @param pack The language pack containing normalization configuration.
|
|
8
|
+
* @returns The normalized string.
|
|
9
|
+
*/
|
|
10
|
+
export declare function normalize(text: string, pack: LanguagePack): string;
|
|
11
|
+
export default normalize;
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Normalizes Amharic text by applying character mapping,
|
|
3
|
+
* labialized sequence normalization, and gemination collapse.
|
|
4
|
+
*
|
|
5
|
+
* @param text The input string to normalize.
|
|
6
|
+
* @param pack The language pack containing normalization configuration.
|
|
7
|
+
* @returns The normalized string.
|
|
8
|
+
*/
|
|
9
|
+
export function normalize(text, pack) {
|
|
10
|
+
if (!pack.normalization) {
|
|
11
|
+
return text;
|
|
12
|
+
}
|
|
13
|
+
let normalized = text;
|
|
14
|
+
// 1. Apply char_map
|
|
15
|
+
const charMap = pack.normalization.char_map || {};
|
|
16
|
+
let chars = normalized.split("");
|
|
17
|
+
for (let i = 0; i < chars.length; i++) {
|
|
18
|
+
if (charMap[chars[i]] !== undefined) {
|
|
19
|
+
chars[i] = charMap[chars[i]];
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
normalized = chars.join("");
|
|
23
|
+
// 2. Apply labialized_map
|
|
24
|
+
const labializedMap = pack.normalization.labialized_map || {};
|
|
25
|
+
let chars2 = normalized.split("");
|
|
26
|
+
for (let i = 0; i < chars2.length; i++) {
|
|
27
|
+
if (labializedMap[chars2[i]] !== undefined) {
|
|
28
|
+
chars2[i] = labializedMap[chars2[i]];
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
normalized = chars2.join("");
|
|
32
|
+
// 3. Collapse gemination
|
|
33
|
+
const threshold = pack.normalization.gemination_threshold;
|
|
34
|
+
if (threshold !== undefined && threshold > 0) {
|
|
35
|
+
const regex = new RegExp(`([^\\s])\\1{${threshold},}`, 'g');
|
|
36
|
+
normalized = normalized.replace(regex, (match, p1) => p1.repeat(threshold));
|
|
37
|
+
}
|
|
38
|
+
return normalized;
|
|
39
|
+
}
|
|
40
|
+
export default normalize;
|
|
41
|
+
//# sourceMappingURL=normalizer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"normalizer.js","sourceRoot":"","sources":["../src/normalizer.ts"],"names":[],"mappings":"AAEA;;;;;;;GAOG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY,EAAE,IAAkB;IACxD,IAAI,CAAC,IAAI,CAAC,aAAa,EAAE,CAAC;QACxB,OAAO,IAAI,CAAA;IACb,CAAC;IAED,IAAI,UAAU,GAAG,IAAI,CAAA;IAErB,oBAAoB;IACpB,MAAM,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC,QAAQ,IAAI,EAAE,CAAA;IACjD,IAAI,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC,CAAA;IAChC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,IAAI,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,SAAS,EAAE,CAAC;YACpC,KAAK,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAA;QAC9B,CAAC;IACH,CAAC;IACD,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IAE3B,0BAA0B;IAC1B,MAAM,aAAa,GAAG,IAAI,CAAC,aAAa,CAAC,cAAc,IAAI,EAAE,CAAA;IAC7D,IAAI,MAAM,GAAG,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC,CAAA;IACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,IAAI,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,KAAK,SAAS,EAAE,CAAC;YAC3C,MAAM,CAAC,CAAC,CAAC,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAA;QACtC,CAAC;IACH,CAAC;IACD,UAAU,GAAG,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IAE5B,yBAAyB;IACzB,MAAM,SAAS,GAAG,IAAI,CAAC,aAAa,CAAC,oBAAoB,CAAA;IACzD,IAAI,SAAS,KAAK,SAAS,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;QAC7C,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,eAAe,SAAS,IAAI,EAAE,GAAG,CAAC,CAAA;QAC3D,UAAU,GAAG,UAAU,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,KAAK,EAAE,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAA;IAC7E,CAAC;IAED,OAAO,UAAU,CAAA;AACnB,CAAC;AAED,eAAe,SAAS,CAAA"}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import type { LanguagePack } from './types.js';
|
|
2
|
+
import type { DocIndexData, QueryIndexData } from './indexer.js';
|
|
3
|
+
export declare class Pipeline { // Binds one LanguagePack to the standalone functions so callers need not pass it on every call.
|
|
4
|
+
private pack; // The language pack supplied to the constructor.
|
|
5
|
+
constructor(pack: LanguagePack);
|
|
6
|
+
normalize(text: string): string; // Delegates to normalize(text, pack).
|
|
7
|
+
sentenceTokenize(text: string): string[]; // Delegates to sentenceTokenize(text, pack).
|
|
8
|
+
stem(word: string): string; // Delegates to stem(word, pack).
|
|
9
|
+
removeStopwords(corpus: string): string; // Delegates to removeStopwords(corpus, pack).
|
|
10
|
+
lexAnalyze(corpus: string): string; // Delegates to lexAnalyze(corpus, pack).
|
|
11
|
+
feligTransliterate(word: string, lang: "am" | "en"): string; // Delegates to felig_transliterate(word, lang, pack).
|
|
12
|
+
seraTransliterate(word: string, lang: "am" | "en"): string; // Delegates to sera_transliterate(word, lang, pack).
|
|
13
|
+
indexDocuments(docs: Array<{ // Delegates to indexDocuments(docs, pack).
|
|
14
|
+
id: string;
|
|
15
|
+
content: string;
|
|
16
|
+
}>): DocIndexData;
|
|
17
|
+
indexQuery(query: string): QueryIndexData; // Delegates to indexQuery(query, pack).
|
|
18
|
+
weighTerms(index: DocIndexData | QueryIndexData, type: "doc" | "query"): Record<string, any>; // Delegates to weighTerms(index, type); the pack is not passed.
|
|
19
|
+
}
|
package/dist/pipeline.js
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { stem } from './stemmer.js';
|
|
2
|
+
import { removeStopwords } from './stopword_remover.js';
|
|
3
|
+
import { lexAnalyze } from './lexical_analyzer.js';
|
|
4
|
+
import { felig_transliterate, sera_transliterate } from './transliterator.js';
|
|
5
|
+
import { indexDocuments, indexQuery } from './indexer.js';
|
|
6
|
+
import { weighTerms } from './term_weighter.js';
|
|
7
|
+
import { normalize } from './normalizer.js';
|
|
8
|
+
import { sentenceTokenize } from './sentence_tokenizer.js';
|
|
9
|
+
/**
 * Convenience facade over the standalone functions: it stores one language pack
 * and forwards it on every call so callers do not have to pass it themselves.
 */
export class Pipeline {
    /** @param pack language pack forwarded to every delegated call */
    constructor(pack) {
        this.pack = pack;
    }

    /** Forwards to `normalize` with the stored pack. */
    normalize(text) {
        const { pack } = this;
        return normalize(text, pack);
    }

    /** Forwards to `sentenceTokenize` with the stored pack. */
    sentenceTokenize(text) {
        const { pack } = this;
        return sentenceTokenize(text, pack);
    }

    /** Forwards to `stem` with the stored pack. */
    stem(word) {
        const { pack } = this;
        return stem(word, pack);
    }

    /** Forwards to `removeStopwords` with the stored pack. */
    removeStopwords(corpus) {
        const { pack } = this;
        return removeStopwords(corpus, pack);
    }

    /** Forwards to `lexAnalyze` with the stored pack. */
    lexAnalyze(corpus) {
        const { pack } = this;
        return lexAnalyze(corpus, pack);
    }

    /** Forwards to `felig_transliterate` with the stored pack. */
    feligTransliterate(word, lang) {
        const { pack } = this;
        return felig_transliterate(word, lang, pack);
    }

    /** Forwards to `sera_transliterate` with the stored pack. */
    seraTransliterate(word, lang) {
        const { pack } = this;
        return sera_transliterate(word, lang, pack);
    }

    /** Forwards to `indexDocuments` with the stored pack. */
    indexDocuments(docs) {
        const { pack } = this;
        return indexDocuments(docs, pack);
    }

    /** Forwards to `indexQuery` with the stored pack. */
    indexQuery(query) {
        const { pack } = this;
        return indexQuery(query, pack);
    }

    /** Forwards to `weighTerms`; this call does not take the pack. */
    weighTerms(index, type) {
        return weighTerms(index, type);
    }
}
|
|
44
|
+
//# sourceMappingURL=pipeline.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,IAAI,EAAE,MAAM,cAAc,CAAA;AACnC,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAA;AACvD,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAA;AAClD,OAAO,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAA;AAC7E,OAAO,EAAE,cAAc,EAAE,UAAU,EAAE,MAAM,cAAc,CAAA;AACzD,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAA;AAE/C,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAA;AAC3C,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAA;AAE1D,MAAM,OAAO,QAAQ;IACnB,YAAoB,IAAkB;QAAlB,SAAI,GAAJ,IAAI,CAAc;IAAG,CAAC;IAE1C,SAAS,CAAC,IAAY;QACpB,OAAO,SAAS,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IACnC,CAAC;IAED,gBAAgB,CAAC,IAAY;QAC3B,OAAO,gBAAgB,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IAC1C,CAAC;IAED,IAAI,CAAC,IAAY;QACf,OAAO,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IAC9B,CAAC;IACD,eAAe,CAAC,MAAc;QAC5B,OAAO,eAAe,CAAC,MAAM,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IAC3C,CAAC;IACD,UAAU,CAAC,MAAc;QACvB,OAAO,UAAU,CAAC,MAAM,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IACtC,CAAC;IACD,kBAAkB,CAAC,IAAY,EAAE,IAAiB;QAChD,OAAO,mBAAmB,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IACnD,CAAC;IACD,iBAAiB,CAAC,IAAY,EAAE,IAAiB;QAC/C,OAAO,kBAAkB,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IAClD,CAAC;IACD,cAAc,CAAC,IAA4C;QACzD,OAAO,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IACxC,CAAC;IACD,UAAU,CAAC,KAAa;QACtB,OAAO,UAAU,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IACrC,CAAC;IACD,UAAU,CAAC,KAAoC,EAAE,IAAqB;QACpE,OAAO,UAAU,CAAC,KAAK,EAAE,IAAI,CAAC,CAAA;IAChC,CAAC;CACF"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { LanguagePack } from './types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Tokenizes Amharic text into sentences using configured sentence boundaries.
|
|
4
|
+
*
|
|
5
|
+
* @param text The input string to split into sentences.
|
|
6
|
+
* @param pack The language pack containing tokenization configuration.
|
|
7
|
+
* @returns An array of sentence strings.
|
|
8
|
+
*/
|
|
9
|
+
export declare function sentenceTokenize(text: string, pack: LanguagePack): string[];
|
|
10
|
+
export default sentenceTokenize;
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Tokenizes Amharic text into sentences using configured sentence boundaries.
|
|
3
|
+
*
|
|
4
|
+
* @param text The input string to split into sentences.
|
|
5
|
+
* @param pack The language pack containing tokenization configuration.
|
|
6
|
+
* @returns An array of sentence strings.
|
|
7
|
+
*/
|
|
8
|
+
export function sentenceTokenize(text, pack) {
|
|
9
|
+
const boundaries = pack.tokenization?.sentence_boundaries || ["።", "፡", "?", "!", "."];
|
|
10
|
+
if (boundaries.length === 0) {
|
|
11
|
+
return [text];
|
|
12
|
+
}
|
|
13
|
+
// Escape boundaries for Regex character class
|
|
14
|
+
const escaped = boundaries.map(b => b.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')).join('');
|
|
15
|
+
const regex = new RegExp(`[${escaped}]+`, 'g');
|
|
16
|
+
// Split on boundaries, trim whitespace, filter out empty sentences
|
|
17
|
+
return text.split(regex)
|
|
18
|
+
.map(s => s.trim())
|
|
19
|
+
.filter(s => s.length > 0);
|
|
20
|
+
}
|
|
21
|
+
export default sentenceTokenize;
|
|
22
|
+
//# sourceMappingURL=sentence_tokenizer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"sentence_tokenizer.js","sourceRoot":"","sources":["../src/sentence_tokenizer.ts"],"names":[],"mappings":"AAEA;;;;;;GAMG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY,EAAE,IAAkB;IAC/D,MAAM,UAAU,GAAG,IAAI,CAAC,YAAY,EAAE,mBAAmB,IAAI,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAA;IACtF,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC5B,OAAO,CAAC,IAAI,CAAC,CAAA;IACf,CAAC;IAED,8CAA8C;IAC9C,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IACtF,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,IAAI,OAAO,IAAI,EAAE,GAAG,CAAC,CAAA;IAE9C,mEAAmE;IACnE,OAAO,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC;SACrB,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;SAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AAC9B,CAAC;AAED,eAAe,gBAAgB,CAAA"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { LanguagePack } from "./types.js";
|
|
2
|
+
/**
|
|
3
|
+
* Takes an Amharic word and returns the stem through affix-removal with longest match.
|
|
4
|
+
* @param word : word possibly containing one or more affixes
|
|
5
|
+
* @param pack : the language pack configuration
|
|
6
|
+
* @returns : the stem of the word passed
|
|
7
|
+
*
|
|
8
|
+
* @example {stem word with affix}
|
|
9
|
+
* stem("ልጆቻቸውን", amPack) // returns "ልጅ"
|
|
10
|
+
*/
|
|
11
|
+
export declare function stem(word: string, pack: LanguagePack): string;
|
|
12
|
+
export default stem;
|
package/dist/stemmer.js
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
// Takes Amharic language words and produces a stem
|
|
2
|
+
// ልጆች -> ልጅኦች -> ljoc -> lj -> ልጅ
|
|
3
|
+
import { felig_transliterate } from "./transliterator.js";
|
|
4
|
+
/** Backslash-escapes every RegExp metacharacter in `str` so that it matches literally. */
function escapeRegExp(str) {
    const metaCharacters = /[.*+?^${}()|[\]\\]/g;
    return str.replace(metaCharacters, (ch) => `\\${ch}`);
}
|
|
7
|
+
/**
 * Takes an Amharic word and returns the stem through affix-removal with longest match.
 *
 * Steps, in order:
 *  1. Protected words (`pack.stemmer.protected_words`) are returned untouched; a word
 *     that becomes protected after removing one configured prefix returns that remainder.
 *  2. The word and all configured affixes are converted with `felig_transliterate(…, "am", …)`.
 *  3. The longest matching suffix, then the longest matching prefix, is removed (one each).
 *  4. Two infix patterns are reduced and a marker "X" is inserted inside the first
 *     two-consonant + "e" cluster.
 *  5. The result is converted back with `felig_transliterate(…, "en", …)`.
 *
 * @param word : word possibly containing one or more affixes
 * @param pack : the language pack configuration
 * @returns : the stem of the word passed
 *
 * @example {stem word with affix}
 * stem("ልጆቻቸውን", amPack) // returns "ልጅ"
 */
export function stem(word, pack) {
    const protectedWords = pack.stemmer.protected_words;
    if (protectedWords) {
        if (protectedWords.includes(word)) {
            return word;
        }
        // Check if stripping any prefix yields a protected word
        for (const prefix of pack.stemmer.prefixes || []) {
            if (!word.startsWith(prefix)) {
                continue;
            }
            const remainder = word.substring(prefix.length);
            if (protectedWords.includes(remainder)) {
                return remainder;
            }
        }
    }
    // consonant-vowel string
    let cv = felig_transliterate(word, "am", pack);
    // Transliterated suffixes, longest first
    const suffixForms = [];
    for (const suffix of pack.stemmer.suffixes || []) {
        suffixForms.push(felig_transliterate(suffix, "am", pack));
        if (suffix.startsWith("ዎ")) {
            // Also register the variant whose leading "ዎ" is written as "ኦ"
            suffixForms.push(felig_transliterate("ኦ" + suffix.substring(1), "am", pack));
        }
    }
    suffixForms.push("Wa"); // Special case for ሯ
    suffixForms.sort((a, b) => b.length - a.length);
    // Transliterated prefixes, longest first
    const prefixForms = [];
    for (const prefix of pack.stemmer.prefixes || []) {
        prefixForms.push(felig_transliterate(prefix, "am", pack));
    }
    prefixForms.sort((a, b) => b.length - a.length);
    // Remove the first (= longest) matching suffix
    const matchedSuffix = suffixForms.find((sfx) => cv.endsWith(sfx));
    if (matchedSuffix !== undefined) {
        cv = cv.slice(0, cv.length - matchedSuffix.length);
    }
    // Remove the first (= longest) matching prefix
    const matchedPrefix = prefixForms.find((pfx) => cv.startsWith(pfx));
    if (matchedPrefix !== undefined) {
        cv = cv.slice(matchedPrefix.length);
    }
    // Remove infixes
    const hasRepeatedConsonant = /.+([^aeiou])[aeiou]\1[aeiou].?/i.test(cv);
    if (hasRepeatedConsonant) {
        cv = cv.replace(/\S\S[^aeiou][aeiou]/i, cv[0] + cv[1]);
    }
    else if (/^(.+)a\1$/i.test(cv)) {
        cv = cv.replace(/a.+/i, "");
    }
    // Split the first consonant-consonant-"e" cluster with an "X" marker
    const cluster = cv.match(/[bcdfghjklmnpqrstvwxyz]{2}e/i);
    if (cluster) {
        const hit = cluster[0];
        cv = cv.replace(/[bcdfghjklmnpqrstvwxyz]{2}e/i, hit.substring(0, 1) + "X" + hit.substring(1));
    }
    return felig_transliterate(cv, "en", pack);
}
|
|
85
|
+
export default stem;
|
|
86
|
+
//# sourceMappingURL=stemmer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"stemmer.js","sourceRoot":"","sources":["../src/stemmer.ts"],"names":[],"mappings":"AAAA,mDAAmD;AACnD,kCAAkC;AAClC,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAA;AAGzD,SAAS,YAAY,CAAC,GAAW;IAC/B,OAAO,GAAG,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAA;AACnD,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,IAAI,CAAC,IAAY,EAAE,IAAkB;IACnD,IAAI,IAAI,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;QACjC,IAAI,IAAI,CAAC,OAAO,CAAC,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;YAChD,OAAO,IAAI,CAAA;QACb,CAAC;QACD,wDAAwD;QACxD,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,EAAE,CAAA;QAC5C,KAAK,MAAM,MAAM,IAAI,QAAQ,EAAE,CAAC;YAC9B,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;gBAC5B,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;gBAC9C,IAAI,IAAI,CAAC,OAAO,CAAC,eAAe,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;oBACpD,OAAO,QAAQ,CAAA;gBACjB,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,SAAS,GAAG,mBAAmB,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC,CAAA,CAAC,yBAAyB;IAE/E,MAAM,OAAO,GAAa,EAAE,CAAA;IAC5B,MAAM,OAAO,GAAa,EAAE,CAAA;IAE5B,uBAAuB;IACvB,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,EAAE,CAAA;IACxC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;QACtB,OAAO,CAAC,IAAI,CAAC,mBAAmB,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;QACrD,IAAI,MAAM,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,GAAG,GAAG,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,CAAA;YAC3C,OAAO,CAAC,IAAI,CAAC,mBAAmB,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;QAC1D,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA,CAAC,qBAAqB;IACxC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,CAAC,CAAA;IAE3C,uBAAuB;IACvB,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,EAAE,CAAA;IACxC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;QACtB,OAAO,CAAC,IAAI,CAAC,mBAAmB,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,CAAC,CAAA;IAE3C,kBAAkB;IAClB,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,EAAE,KAAK;QAChC,IAAI,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;
YAC5B,IAAI,KAAK,GAAG,IAAI,MAAM,CAAC,GAAG,YAAY,CAAC,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAA;YACpD,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAA;YACxC,OAAO,KAAK,CAAA;QACd,CAAC;;YAAM,OAAO,IAAI,CAAA;IACpB,CAAC,CAAC,CAAA;IAEF,kBAAkB;IAClB,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,EAAE,KAAK;QAChC,IAAI,SAAS,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YAC9B,IAAI,KAAK,GAAG,IAAI,MAAM,CAAC,IAAI,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAA;YAC/C,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAA;YACxC,OAAO,KAAK,CAAA;QACd,CAAC;;YAAM,OAAO,IAAI,CAAA;IACpB,CAAC,CAAC,CAAA;IAEF,iBAAiB;IACjB,IAAI,iCAAiC,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QACtD,SAAS,GAAG,SAAS,CAAC,OAAO,CAC3B,sBAAsB,EACtB,SAAS,CAAC,CAAC,CAAC,GAAG,SAAS,CAAC,CAAC,CAAC,CAC5B,CAAA;IACH,CAAC;SAAM,IAAI,YAAY,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QACxC,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAA;IAC3C,CAAC;IAED,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAA;IAChE,IAAI,QAAQ,EAAE,CAAC;QACb,SAAS,GAAG,SAAS,CAAC,OAAO,CAC3B,8BAA8B,EAC9B,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,GAAG,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAC7D,CAAA;IACH,CAAC;IAED,OAAO,mBAAmB,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,CAAA;AACnD,CAAC;AAED,eAAe,IAAI,CAAA"}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { LanguagePack } from './types.js';
|
|
2
|
+
/**
|
|
3
|
+
* Removes commonly occurring words that make no contribution to the semantics of the corpus.
|
|
4
|
+
* @param corpus : Amharic text
|
|
5
|
+
* @param pack : language pack configuration
|
|
6
|
+
* @returns : the corpus without stopwords listed on {@link LanguagePack.stopwords}
|
|
7
|
+
*
|
|
8
|
+
* @example {remove stopwords}
|
|
9
|
+
* removeStopwords("ይህ ሞባይል እና ኮምፒዩተር", amPack) // returns "ሞባይል ኮምፒዩተር"
|
|
10
|
+
*/
|
|
11
|
+
export declare function removeStopwords(corpus: string, pack: LanguagePack): string;
|
|
12
|
+
export default removeStopwords;
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
 * Backslash-escapes every regular-expression metacharacter in `string`
 * so the result can be embedded in a `RegExp` source as a literal.
 */
function escapeRegExp(string) {
    const metaCharacters = /[.*+?^${}()|[\]\\]/g;
    const escaped = string.replace(metaCharacters, '\\$&');
    return escaped;
}
|
|
4
|
+
/**
 * Strips the stopwords listed on {@link LanguagePack.stopwords} from a text.
 *
 * A stopword is only removed when it stands between characters outside the
 * range U+1200–U+137F (or at the start/end of the text). One of the prefixes
 * የ, በ, ከ, ለ, ስለ, የሚ, የማ in front of it and one of the suffixes ም, ን behind it
 * are tolerated and left in the text; only the stopword itself is dropped.
 * Whitespace around a removed word is not collapsed.
 *
 * @param corpus : Amharic text
 * @param pack : language pack configuration
 * @returns : the corpus with the stopwords removed
 *
 * @example {remove stopwords, with pack.stopwords = ["ይህ", "እና"]}
 * removeStopwords("ይህ ሞባይል እና ኮምፒዩተር", pack) // returns " ሞባይል  ኮምፒዩተር"
 */
export function removeStopwords(corpus, pack) {
    // Longer stopwords are processed before shorter ones.
    const longestFirst = [...pack.stopwords].sort((a, b) => b.length - a.length);
    let text = corpus;
    for (const stopword of longestFirst) {
        // Capture groups: 1 = leading boundary, 2 = optional prefix,
        // 3 = the stopword, 4 = optional suffix. The trailing boundary is a
        // lookahead so it stays available as the leading boundary of the next match.
        // NOTE(review): U+1200–U+137F also contains Ethiopic punctuation such as "።",
        // so a stopword directly followed by it is not matched — confirm this is intended.
        const pattern = new RegExp(`(^|[^\\u1200-\\u137F])(የ|በ|ከ|ለ|ስለ|የሚ|የማ)?(${escapeRegExp(stopword)})(ም|ን)?(?=[^\\u1200-\\u137F]|$)`, 'g');
        text = text.replace(pattern, (_match, boundary, prefix, _stopword, suffix) => {
            const keptPrefix = prefix || '';
            const keptSuffix = suffix || '';
            return `${boundary}${keptPrefix}${keptSuffix}`;
        });
    }
    return text;
}
|
|
32
|
+
export default removeStopwords;
|
|
33
|
+
//# sourceMappingURL=stopword_remover.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"stopword_remover.js","sourceRoot":"","sources":["../src/stopword_remover.ts"],"names":[],"mappings":"AAGA,SAAS,YAAY,CAAC,MAAc;IAClC,OAAO,MAAM,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAA;AACtD,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,eAAe,CAAC,MAAc,EAAE,IAAkB;IAChE,IAAI,MAAM,GAAG,MAAM,CAAA;IACnB,kEAAkE;IAClE,MAAM,eAAe,GAAG,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,CAAC,CAAA;IAE/E,eAAe,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;QAC/B,sBAAsB;QACtB,gEAAgE;QAChE,mDAAmD;QACnD,+BAA+B;QAC/B,8CAA8C;QAC9C,gFAAgF;QAChF,MAAM,KAAK,GAAG,IAAI,MAAM,CACtB,6CAA6C,YAAY,CAAC,IAAI,CAAC,iCAAiC,EAChG,GAAG,CACJ,CAAA;QACD,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,KAAK,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE;YACvD,gEAAgE;YAChE,OAAO,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,EAAE,CAAA;QACtC,CAAC,CAAC,CAAA;IACJ,CAAC,CAAC,CAAA;IAEF,OAAO,MAAM,CAAA;AACf,CAAC;AAED,eAAe,eAAe,CAAA"}
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
import type { DocIndexData, QueryIndexData } from "./indexer.js";
|
|
2
|
+
export declare function weighTerms(index: DocIndexData | QueryIndexData, type: "doc" | "query"): Record<string, any>;
|
|
3
|
+
export declare function weigh_terms(indexFilePath: string, outputWeightedTermsPath: string, typeOfIndex: "doc" | "query"): Promise<void>;
|
|
4
|
+
export default weigh_terms;
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
// Pure function — no fs
/**
 * Computes TF-IDF style weights from an index object.
 *
 * @param index : for "doc", an object with `corpus_size`, `words` (term -> array of
 *   single-entry `{ file: frequency }` objects) and `corpus_word_count` (file -> word count);
 *   for "query", an object with `words` (term -> frequency) and a numeric `corpus_word_count`
 * @param type : "doc" or "query"
 * @returns : for "doc", term -> array of `{ [file]: tf * idf }` with
 *   idf = log2(corpus_size / number of postings) and tf = frequency / corpus_word_count[file];
 *   for "query", term -> frequency / corpus_word_count (idf is fixed to 1);
 *   for any other type, an empty object
 */
export function weighTerms(index, type) {
    // Results are assembled as [term, weight] pairs and turned into an object with
    // Object.fromEntries, which always defines own properties. Terms that collide
    // with inherited names ("constructor", "toString", "__proto__", ...) are
    // therefore handled like any other term instead of throwing or being dropped.
    if (type === "doc") {
        const weightedEntries = [];
        for (const [word, postings] of Object.entries(index.words)) {
            // A term without postings produces no entry.
            if (postings.length === 0) {
                continue;
            }
            const idf = Math.log2(index.corpus_size / postings.length);
            const weightedPostings = postings.map((posting) => {
                // Each posting is expected to hold exactly one file -> frequency pair.
                const file = Object.keys(posting)[0];
                const freq = Object.values(posting)[0];
                // Length-normalized term frequency.
                const tf = freq / index.corpus_word_count[file];
                return { [file]: idf * tf };
            });
            weightedEntries.push([word, weightedPostings]);
        }
        return Object.fromEntries(weightedEntries);
    }
    if (type === "query") {
        // A query has no corpus to derive an idf from, so it is fixed to 1.
        const queryIdf = 1;
        const weightedEntries = Object.entries(index.words).map(([word, freq]) => {
            const tf = freq / index.corpus_word_count;
            return [word, queryIdf * tf];
        });
        return Object.fromEntries(weightedEntries);
    }
    return {};
}
|
|
42
|
+
// Backwards-compat Node.js wrapper
/**
 * Reads a JSON index file, weighs its terms with {@link weighTerms} and writes the
 * result to `<outputWeightedTermsPath>/<typeOfIndex>WeightedTermsFile.json`.
 * Read, parse/weigh and write failures are logged to the console, not thrown.
 *
 * @param indexFilePath : path of the JSON index file to read
 * @param outputWeightedTermsPath : directory the weighted terms file is written to
 * @param typeOfIndex : "doc" or "query"
 */
export async function weigh_terms(indexFilePath, outputWeightedTermsPath, typeOfIndex) {
    const { readFileSync, writeFileSync } = await import("fs");
    const weightedTermsPath = outputWeightedTermsPath + `/${typeOfIndex}WeightedTermsFile.json`;

    // Step 1: read the index file.
    let jsonString;
    try {
        jsonString = readFileSync(indexFilePath, "utf8");
    }
    catch (error) {
        console.log(`Error reading Index file ${indexFilePath} from disk:`, error);
        return;
    }

    // Step 2: parse, weigh and serialize.
    let outJsonString;
    try {
        const dataset = JSON.parse(jsonString);
        outJsonString = JSON.stringify(weighTerms(dataset, typeOfIndex), null, 2);
    }
    catch (err) {
        console.log("Error parsing JSON string:", err);
        return;
    }

    // Step 3: write the weighted terms file.
    try {
        writeFileSync(weightedTermsPath, outJsonString);
        console.log(`Indexed terms successfully weighted`);
    }
    catch (error) {
        console.log("Weighted terms creation failed", error);
    }
}
|
|
70
|
+
export default weigh_terms;
|
|
71
|
+
//# sourceMappingURL=term_weighter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"term_weighter.js","sourceRoot":"","sources":["../src/term_weighter.ts"],"names":[],"mappings":"AAEA,wBAAwB;AACxB,MAAM,UAAU,UAAU,CACtB,KAAoC,EACpC,IAAqB;IAErB,MAAM,aAAa,GAAwB,EAAE,CAAC;IAE9C,IAAI,IAAI,KAAK,KAAK,EAAE,CAAC;QACjB,MAAM,OAAO,GAAG,KAAqB,CAAC;QACtC,IAAI,EAAE,GAAG,CAAC,CAAC;QACX,IAAI,GAAG,GAAG,CAAC,CAAC;QACZ,IAAI,MAAM,GAAG,CAAC,CAAC;QAEf,4EAA4E;QAC5E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;YACxC,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC;YAElE,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,OAAO,CACvB,CAAC,WAAmC,EAAE,EAAE;gBACpC,IAAI,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;gBACvC,IAAI,IAAI,GAAG,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;gBAEzC,EAAE,GAAG,IAAI,GAAG,OAAO,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC;gBAC5C,MAAM,GAAG,GAAG,GAAG,EAAE,CAAC;gBAElB,+BAA+B;gBAC/B,IAAI,IAAI,IAAI,aAAa,EAAE,CAAC;oBACxB,aAAa,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;gBACjD,CAAC;qBAAM,CAAC;oBACJ,aAAa,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;gBAC/C,CAAC;YACL,CAAC,CACJ,CAAC;QACN,CAAC,CAAC,CAAC;IACP,CAAC;SAAM,IAAI,IAAI,KAAK,OAAO,EAAE,CAAC;QAC1B,MAAM,OAAO,GAAG,KAAuB,CAAC;QACxC,IAAI,EAAE,GAAG,CAAC,CAAC;QACX,IAAI,GAAG,GAAG,CAAC,CAAC;QACZ,IAAI,MAAM,GAAG,CAAC,CAAC;QAEf,4EAA4E;QAC5E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;YACxC,IAAI,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YAE/B,EAAE,GAAG,IAAI,GAAG,OAAO,CAAC,iBAAiB,CAAC;YACtC,MAAM,GAAG,GAAG,GAAG,EAAE,CAAC;YAElB,aAAa,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC;QACjC,CAAC,CAAC,CAAC;IACP,CAAC;IAED,OAAO,aAAa,CAAC;AACzB,CAAC;AAED,mCAAmC;AACnC,MAAM,CAAC,KAAK,UAAU,WAAW,CAC7B,aAAqB,EACrB,uBAA+B,EAC/B,WAA4B;IAE5B,MAAM,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC;IAC9B,MAAM,iBAAiB,GACnB,uBAAuB,GAAG,IAAI,WAAW,wBAAwB,CAAC;IAEtE,kBAAkB;IAClB,IAAI,CAAC;QACD,MAAM,UAAU,GAAG,EAAE,CAAC,YAAY,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC;QAC1D,IAAI,CAAC;YACD,MAAM,OAAO,GAAG,IA
AI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;YACvC,MAAM,MAAM,GAAG,UAAU,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;YAEhD,iBAAiB;YACjB,MAAM,aAAa,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;YAEtD,IAAI,CAAC;gBACD,EAAE,CAAC,aAAa,CAAC,iBAAiB,EAAE,aAAa,CAAC,CAAC;gBACnD,OAAO,CAAC,GAAG,CAAC,qCAAqC,CAAC,CAAC;YACvD,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACb,OAAO,CAAC,GAAG,CAAC,gCAAgC,EAAE,KAAK,CAAC,CAAC;YACzD,CAAC;QACL,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,OAAO,CAAC,GAAG,CAAC,4BAA4B,EAAE,GAAG,CAAC,CAAC;QACnD,CAAC;IACL,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACb,OAAO,CAAC,GAAG,CACP,4BAA4B,aAAa,aAAa,EACtD,KAAK,CACR,CAAC;IACN,CAAC;AACL,CAAC;AAED,eAAe,WAAW,CAAC"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import type { LanguagePack } from './types.js';
|
|
2
|
+
/**
|
|
3
|
+
*@deprecated Use {@link felig_transliterate} function instead.
|
|
4
|
+
*/
|
|
5
|
+
export declare function sera_transliterate(word: string, lang: "am" | "en", pack: LanguagePack): string;
|
|
6
|
+
/**
|
|
7
|
+
* Transliterates between Amharic and English
|
|
8
|
+
* @param word : English or Amharic word
|
|
9
|
+
* @param lang : language to transliterate from
|
|
10
|
+
* @returns : a transliterated string
|
|
11
|
+
*
|
|
12
|
+
* @example{ transliterate Amharic word to English}
|
|
13
|
+
* felig_transliterate("ወንበር", "am", pack) // returns "wenber"
|
|
14
|
+
*/
|
|
15
|
+
export declare function felig_transliterate(word: string, lang: "am" | "en", pack: LanguagePack): string;
|
|
16
|
+
declare const transliterate: {
|
|
17
|
+
sera_transliterate: typeof sera_transliterate;
|
|
18
|
+
felig_transliterate: typeof felig_transliterate;
|
|
19
|
+
};
|
|
20
|
+
export default transliterate;
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
/**
 * Transliterates a word with the `sera` lookup table of the language pack.
 * Characters or chunks without a table entry are dropped from the output.
 *
 * @deprecated Use {@link felig_transliterate} function instead.
 * @param word : word to transliterate
 * @param lang : language of `word`; "am" maps each character through the table,
 *   "en" maps each 1–2 character chunk back to the first table key holding it
 * @param pack : language pack configuration
 * @returns : the transliterated string ("" for any other `lang`)
 */
export function sera_transliterate(word, lang, pack) {
    const table = pack.transliteration.sera.map;
    let output = "";
    if (lang === "am") {
        for (const letter of word.split("")) {
            const latin = table[letter];
            if (latin !== undefined) {
                output += latin;
            }
        }
    }
    else if (lang === "en") {
        // The word is consumed two characters at a time (the last chunk may be one).
        const chunks = word.match(/.{1,2}/g);
        if (chunks) {
            for (const chunk of chunks) {
                const source = Object.keys(table).find((key) => table[key] === chunk);
                if (source !== undefined) {
                    output += source;
                }
            }
        }
    }
    return output;
}
|
|
28
|
+
/**
 * Transliterates between Amharic and English using the `felig` lookup table
 * of the language pack. Input without a table entry is dropped from the output.
 *
 * @param word : English or Amharic word
 * @param lang : language to transliterate from ("am" or "en")
 * @param pack : language pack configuration
 * @returns : a transliterated string ("" for any other `lang`)
 *
 * @example { transliterate Amharic word to English}
 * felig_transliterate("ወንበር", "am", pack) // returns "wenber"
 */
export function felig_transliterate(word, lang, pack) {
    const table = pack.transliteration.felig.map;

    if (lang === "am") {
        let latinWord = "";
        for (const letter of word.split("")) {
            const latin = table[letter];
            if (latin !== undefined) {
                latinWord += latin;
            }
        }
        return latinWord;
    }

    if (lang !== "en") {
        return "";
    }

    // The word is consumed two characters at a time (the last chunk may be one).
    const chunks = word.match(/.{1,2}/g);
    if (chunks === null) {
        return "";
    }

    // Reverse lookup: first table key whose value equals `latin`.
    const keyFor = (latin) => Object.keys(table).find((key) => table[key] === latin);

    let amharicWord = "";
    for (const chunk of chunks) {
        if (/[^aeiou][aeiou]/i.test(chunk)) {
            // Non-vowel + vowel chunk: looked up as a whole; "Wa" is looked up lower-cased.
            const lookup = /[W][a]/g.test(chunk) ? chunk.toLowerCase() : chunk;
            const fidel = keyFor(lookup);
            if (fidel !== undefined) {
                amharicWord += fidel;
            }
            continue;
        }
        // Any other chunk is looked up character by character.
        let spelled = "";
        for (const ltr of chunk.split("")) {
            const fidel = keyFor(ltr);
            if (fidel !== undefined) {
                spelled += fidel;
            }
        }
        // A chunk that resolves to nothing, or to a lone "ኧ", is skipped.
        if (spelled !== "" && spelled !== "ኧ") {
            amharicWord += spelled;
        }
    }
    return amharicWord;
}
|
|
83
|
+
const transliterate = { // Bundles both transliterators; this object is the module's default export.
|
|
84
|
+
sera_transliterate, // deprecated in favour of felig_transliterate (see its @deprecated tag)
|
|
85
|
+
felig_transliterate,
|
|
86
|
+
};
|
|
87
|
+
export default transliterate;
|
|
88
|
+
//# sourceMappingURL=transliterator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"transliterator.js","sourceRoot":"","sources":["../src/transliterator.ts"],"names":[],"mappings":"AAGA;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAAC,IAAY,EAAE,IAAiB,EAAE,IAAkB;IACpF,IAAI,UAAU,GAAG,EAAE,CAAA;IACnB,MAAM,iCAAiC,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,GAAG,CAAA;IAEvE,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QAClB,IAAI,MAAM,GAAa,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAA;QACrC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;YACxB,IAAI,iCAAiC,CAAC,MAAM,CAAC,KAAK,SAAS,EAAE,CAAC;gBAC5D,UAAU,IAAI,iCAAiC,CAAC,MAAM,CAAC,CAAA;YACzD,CAAC;QACH,CAAC,CAAC,CAAA;IACJ,CAAC;SAAM,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QACzB,IAAI,MAAM,GAAoB,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QACnD,IAAI,MAAM,EAAE,CAAC;YACX,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;gBACxB,IAAI,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC,IAAI,CACjE,CAAC,GAAG,EAAE,EAAE,CAAC,iCAAiC,CAAC,GAAG,CAAC,KAAK,MAAM,CAC3D,CAAA;gBACD,IAAI,SAAS,KAAK,SAAS,EAAE,CAAC;oBAC5B,UAAU,IAAI,SAAS,CAAA;gBACzB,CAAC;YACH,CAAC,CAAC,CAAA;QACJ,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAA;AACnB,CAAC;AAED;;;;;;;;GAQG;AAEH,MAAM,UAAU,mBAAmB,CAAC,IAAY,EAAE,IAAiB,EAAE,IAAkB;IACrF,IAAI,UAAU,GAAG,EAAE,CAAA;IACnB,MAAM,kCAAkC,GAAG,IAAI,CAAC,eAAe,CAAC,KAAK,CAAC,GAAG,CAAA;IAEzE,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QAClB,IAAI,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAA;QAC3B,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;YACxB,IAAI,kCAAkC,CAAC,MAAM,CAAC,KAAK,SAAS,EAAE,CAAC;gBAC7D,UAAU,IAAI,kCAAkC,CAAC,MAAM,CAAC,CAAA;YAC1D,CAAC;QACH,CAAC,CAAC,CAAA;IACJ,CAAC;SAAM,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QACzB,IAAI,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QAElC,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;YACpB,OAAO,EAAE,CAAA;QACX,CAAC;QAED,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;YACxB,IAAI,kBAAkB,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;gBACpC,IAAI,SAAS,GAAW,EAAE,CAAA;gBAE1B,IAAI,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;oBAC3B,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,kCAAkC,CAAC,CAAC,IAAI,CAC9D,CAAC,GAAG,EAAE,EAAE,CACN,kCAAkC,CAAC,GAAG,CAAC,KAAK,MAAM,CAAC,WAAW,EAAE,CAClE,CAAA;gBACJ,CAAC;qBAAM,CAAC;oBACN,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,kCAAkC,CAAC,C
AAC,IAAI,CAC9D,CAAC,GAAG,EAAE,EAAE,CAAC,kCAAkC,CAAC,GAAG,CAAC,KAAK,MAAM,CAC3D,CAAA;gBACJ,CAAC;gBAED,IAAI,SAAS,KAAK,SAAS,EAAE,CAAC;oBAC5B,UAAU,IAAI,SAAS,CAAA;gBACzB,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,IAAI,IAAI,GAAG,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAA;gBAC3B,IAAI,SAAS,GAAG,EAAE,CAAA;gBAClB,IAAI,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE;oBACnB,MAAM,KAAK,GAAG,MAAM,CAAC,IAAI,CAAC,kCAAkC,CAAC,CAAC,IAAI,CAChE,CAAC,GAAG,EAAE,EAAE,CAAC,kCAAkC,CAAC,GAAG,CAAC,KAAK,GAAG,CACzD,CAAA;oBACD,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;wBACxB,SAAS,IAAI,KAAK,CAAA;oBACpB,CAAC;gBACH,CAAC,CAAC,CAAA;gBAEF,IAAI,SAAS,KAAK,EAAE,IAAI,SAAS,KAAK,GAAG,EAAE,CAAC;oBAC1C,UAAU,IAAI,SAAS,CAAA;gBACzB,CAAC;YACH,CAAC;QACH,CAAC,CAAC,CAAA;IACJ,CAAC;IAED,OAAO,UAAU,CAAA;AACnB,CAAC;AAED,MAAM,aAAa,GAAG;IACpB,kBAAkB;IAClB,mBAAmB;CACpB,CAAA;AAED,eAAe,aAAa,CAAA"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
export interface LanguagePackMeta {
|
|
2
|
+
code: string;
|
|
3
|
+
name: string;
|
|
4
|
+
native_name?: string;
|
|
5
|
+
script: string;
|
|
6
|
+
version?: string;
|
|
7
|
+
authors?: string[];
|
|
8
|
+
}
|
|
9
|
+
export interface NormalizationConfig {
|
|
10
|
+
char_map: Record<string, string>;
|
|
11
|
+
labialized_map: Record<string, string>;
|
|
12
|
+
gemination_threshold: number;
|
|
13
|
+
}
|
|
14
|
+
export interface TokenizationConfig {
|
|
15
|
+
split_on_spaces: boolean;
|
|
16
|
+
sentence_boundaries: string[];
|
|
17
|
+
punctuation: string[];
|
|
18
|
+
exceptions: Record<string, string[]>;
|
|
19
|
+
}
|
|
20
|
+
export interface StemmerConfig { // Affix lists and exceptions read by stem()
|
|
21
|
+
prefixes: string[]; // prefixes in the original script; stem() transliterates them and strips the longest match
|
|
22
|
+
suffixes: string[]; // suffixes in the original script; stem() transliterates them and strips the longest match
|
|
23
|
+
protected_words: string[]; // words stem() returns unchanged, also when found after removing one prefix
|
|
24
|
+
}
|
|
25
|
+
export interface TransliterationSchemeConfig { // One transliteration table (used as pack.transliteration.sera / .felig)
|
|
26
|
+
scheme: string; // NOTE(review): presumably the scheme's name; not read by the transliterators — confirm
|
|
27
|
+
map: Record<string, string>; // key: single source-script character; value: its Latin spelling (reverse lookups compare values)
|
|
28
|
+
}
|
|
29
|
+
export interface TransliterationConfig {
|
|
30
|
+
sera: TransliterationSchemeConfig;
|
|
31
|
+
felig: TransliterationSchemeConfig;
|
|
32
|
+
}
|
|
33
|
+
export interface NumbersConfig {
|
|
34
|
+
ethiopic_to_arabic: Record<string, string>;
|
|
35
|
+
}
|
|
36
|
+
export interface SentimentConfig {
|
|
37
|
+
model: string;
|
|
38
|
+
lexicon: string;
|
|
39
|
+
}
|
|
40
|
+
export interface NERConfig {
|
|
41
|
+
model: string;
|
|
42
|
+
name_lists: string[];
|
|
43
|
+
}
|
|
44
|
+
export interface LanguagePack { // Per-language configuration object passed as `pack` to the toolkit functions
|
|
45
|
+
meta: LanguagePackMeta;
|
|
46
|
+
normalization?: NormalizationConfig;
|
|
47
|
+
tokenization?: TokenizationConfig;
|
|
48
|
+
stopwords: string[]; // words removed by removeStopwords()
|
|
49
|
+
stemmer: StemmerConfig; // affixes and protected words used by stem()
|
|
50
|
+
transliteration: TransliterationConfig; // lookup tables used by sera_transliterate() / felig_transliterate()
|
|
51
|
+
numbers?: NumbersConfig;
|
|
52
|
+
sentiment?: SentimentConfig;
|
|
53
|
+
ner?: NERConfig;
|
|
54
|
+
}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
|
package/package.json
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@fidel-tools/core",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Amharic Language Pre-processor toolkit",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"types": "dist/index.d.ts",
|
|
7
|
+
"type": "module",
|
|
8
|
+
"files": [
|
|
9
|
+
"dist"
|
|
10
|
+
],
|
|
11
|
+
"publishConfig": {
|
|
12
|
+
"access": "public"
|
|
13
|
+
},
|
|
14
|
+
"keywords": [
|
|
15
|
+
"Amharic",
|
|
16
|
+
"Stemmer",
|
|
17
|
+
"Affixes",
|
|
18
|
+
"Amharic",
|
|
19
|
+
"Stopword",
|
|
20
|
+
"Amharic",
|
|
21
|
+
"Sentiment",
|
|
22
|
+
"analysis"
|
|
23
|
+
],
|
|
24
|
+
"author": "Fidel Tools Team",
|
|
25
|
+
"license": "MIT",
|
|
26
|
+
"repository": {
|
|
27
|
+
"type": "git",
|
|
28
|
+
"url": "https://github.com/Yehonatal/fidel-tools.git"
|
|
29
|
+
},
|
|
30
|
+
"homepage": "https://fidel-tools.vercel.app/",
|
|
31
|
+
"devDependencies": {
|
|
32
|
+
"@babel/core": "^7.19.0",
|
|
33
|
+
"@babel/plugin-transform-modules-commonjs": "^7.18.6",
|
|
34
|
+
"@types/node": "^20.19.42",
|
|
35
|
+
"jest": "^28.1.3",
|
|
36
|
+
"typescript": "^5.0.4"
|
|
37
|
+
},
|
|
38
|
+
"dependencies": {},
|
|
39
|
+
"scripts": {
|
|
40
|
+
"start": "node dist/index.js",
|
|
41
|
+
"build": "tsc",
|
|
42
|
+
"test": "pnpm jest --coverage"
|
|
43
|
+
}
|
|
44
|
+
}
|