@fidel-tools/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Liul Alemayehu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,56 @@
1
+ # @fidel-tools/core
2
+
3
+ The core NLP pipeline and text pre-processing engine for Amharic and Ethiopic script processing.
4
+
5
+ ---
6
+
7
+ ## Features
8
+
9
+ - **Normalization**: Standardizes homophones, expands labialized strings, and collapses character gemination.
10
+ - **Tokenization**: Standard and sentence-level tokenizers with exception mapping (abbreviation expansion).
11
+ - **Stopwords**: Morphology-aware boundary filtering that removes stopwords safely without corrupting base stems.
12
+ - **Light Stemmer**: Prefix- and suffix-removal algorithms for root extraction.
13
+ - **Transliteration**: Bidirectional SERA and Felig ASCII transliterators.
14
+
15
+ ---
16
+
17
+ ## Installation
18
+
19
+ ```bash
20
+ pnpm add @fidel-tools/core
21
+ ```
22
+
23
+ ---
24
+
25
+ ## API & Usage
26
+
27
+ ### 1. Unified Pipeline
28
+
29
+ ```typescript
30
+ import { Pipeline } from '@fidel-tools/core'
31
+ import amPack from '@fidel-tools/lang-am' // Or your custom pack
32
+
33
+ const nlp = new Pipeline(amPack)
34
+
35
+ const normalized = nlp.normalize("ሐኪም ኀይሉ")
36
+ const tokens = nlp.lexAnalyze("ት/ቤት እና መስሪያ ቤት")
37
+ const stemmed = nlp.stem("ልጆቻቸውን")
38
+ ```
39
+
40
+ ### 2. Low-Level Component Exports
41
+
42
+ ```typescript
43
+ import { normalize, sentenceTokenize, removeStopwords, stem } from '@fidel-tools/core'
44
+ import amPack from '@fidel-tools/lang-am'
45
+
46
+ // Individual functional components
47
+ const text = normalize("ሐኪም ኀይሉ", amPack)
48
+ const sentences = sentenceTokenize("ይህ ዓረፍተ ነገር ነው። ያኛው ደግሞ፡", amPack)
49
+ const cleaned = removeStopwords("እና በመሆኑም ትምህርት", amPack)
50
+ ```
51
+
52
+ ---
53
+
54
+ ## License
55
+
56
+ [MIT License](./LICENSE)
@@ -0,0 +1,10 @@
1
+ export type { LanguagePack, LanguagePackMeta, StemmerConfig, TransliterationConfig } from './types.js';
2
+ export { Pipeline } from './pipeline.js';
3
+ export { normalize } from './normalizer.js';
4
+ export { sentenceTokenize } from './sentence_tokenizer.js';
5
+ export { stem } from './stemmer.js';
6
+ export { removeStopwords } from './stopword_remover.js';
7
+ export { lexAnalyze } from './lexical_analyzer.js';
8
+ export { felig_transliterate, sera_transliterate } from './transliterator.js';
9
+ export { indexDocuments, indexQuery, indexTerms } from './indexer.js';
10
+ export { weighTerms, weigh_terms } from './term_weighter.js';
package/dist/index.js ADDED
@@ -0,0 +1,12 @@
1
+ // Pipeline (primary API)
2
+ export { Pipeline } from './pipeline.js';
3
+ // Individual functions (secondary API, for tree-shaking)
4
+ export { normalize } from './normalizer.js';
5
+ export { sentenceTokenize } from './sentence_tokenizer.js';
6
+ export { stem } from './stemmer.js';
7
+ export { removeStopwords } from './stopword_remover.js';
8
+ export { lexAnalyze } from './lexical_analyzer.js';
9
+ export { felig_transliterate, sera_transliterate } from './transliterator.js';
10
+ export { indexDocuments, indexQuery, indexTerms } from './indexer.js';
11
+ export { weighTerms, weigh_terms } from './term_weighter.js';
12
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAGA,yBAAyB;AACzB,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAA;AAExC,yDAAyD;AACzD,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAA;AAC3C,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAA;AAC1D,OAAO,EAAE,IAAI,EAAE,MAAM,cAAc,CAAA;AACnC,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAA;AACvD,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAA;AAClD,OAAO,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAA;AAC7E,OAAO,EAAE,cAAc,EAAE,UAAU,EAAE,UAAU,EAAE,MAAM,cAAc,CAAA;AACrE,OAAO,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAA"}
@@ -0,0 +1,18 @@
1
+ import type { LanguagePack } from './types.js';
2
+ export interface DocIndexData {
3
+ corpus_size: number;
4
+ corpus_word_count: Record<string, number>;
5
+ words: Record<string, Array<Record<string, number>>>;
6
+ }
7
+ export interface QueryIndexData {
8
+ corpus_size: number;
9
+ corpus_word_count: number;
10
+ words: Record<string, number>;
11
+ }
12
+ export declare function indexDocuments(docs: Array<{
13
+ id: string;
14
+ content: string;
15
+ }>, pack: LanguagePack): DocIndexData;
16
+ export declare function indexQuery(query: string, pack: LanguagePack): QueryIndexData;
17
+ export declare function indexTerms(corpus: string[], outputIndexFilePath: string, type: "doc" | "query", pack: LanguagePack): Promise<void>;
18
+ export default indexTerms;
@@ -0,0 +1,107 @@
1
+ import stem from "./stemmer.js";
2
+ import lexAnalyze from "./lexical_analyzer.js";
3
+ import rmvStopwrd from "./stopword_remover.js";
4
+ // Pure functions — no fs dependency
5
/**
 * Builds an inverted index over a set of in-memory documents.
 *
 * Every document is passed through `lexAnalyze`, stop-word removal and `stem`;
 * stems that are empty or only one character long are discarded.
 *
 * @param docs Documents to index, each with an `id` and its text `content`.
 * @param pack Language pack forwarded to every preprocessing step.
 * @returns An object with `corpus_size` (number of documents),
 *   `corpus_word_count` (per document id, the number of space-separated chunks
 *   in the raw content) and `words` (stem -> list of `{ [docId]: count }`).
 */
export function indexDocuments(docs, pack) {
    // Own-property test: the `in` operator also sees inherited members such as
    // "constructor" or "toString", which would be mistaken for indexed terms.
    const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);
    const indexData = {
        corpus_size: docs.length,
        corpus_word_count: {},
        words: {}
    };
    for (const doc of docs) {
        // Counted on the raw content, before any preprocessing.
        indexData.corpus_word_count[doc.id] = doc.content.split(" ").length;
        const terms = rmvStopwrd(lexAnalyze(doc.content, pack), pack)
            .split(" ")
            .map((token) => stem(token, pack))
            .filter((term) => term && term.length > 1);
        for (const term of terms) {
            if (!hasOwn(indexData.words, term)) {
                // First time this stem is seen anywhere in the corpus.
                indexData.words[term] = [{ [doc.id]: 1 }];
                continue;
            }
            const postings = indexData.words[term];
            const posting = postings.find((entry) => hasOwn(entry, doc.id));
            if (posting) {
                posting[doc.id]++;
            }
            else {
                // Known stem, but first occurrence inside this document.
                postings.push({ [doc.id]: 1 });
            }
        }
    }
    return indexData;
}
45
/**
 * Builds a term-frequency index for a single query string.
 *
 * The query goes through the same steps as documents: `lexAnalyze`,
 * stop-word removal, `stem`, then removal of empty and one-character stems.
 *
 * @param query Raw query text.
 * @param pack Language pack forwarded to every preprocessing step.
 * @returns An object with `corpus_size` (always 1), `corpus_word_count`
 *   (number of space-separated chunks in the raw query) and `words`
 *   (stem -> number of occurrences).
 */
export function indexQuery(query, pack) {
    const indexData = {
        corpus_size: 1,
        corpus_word_count: query.split(" ").length,
        words: {}
    };
    const terms = rmvStopwrd(lexAnalyze(query, pack), pack)
        .split(" ")
        .map((token) => stem(token, pack))
        .filter((term) => term && term.length > 1);
    for (const term of terms) {
        // Own-property test: with `in`, an inherited name such as "constructor"
        // looks already counted and incrementing it produces NaN.
        const seen = Object.prototype.hasOwnProperty.call(indexData.words, term);
        indexData.words[term] = seen ? indexData.words[term] + 1 : 1;
    }
    return indexData;
}
70
+ // Backwards-compat Node.js wrapper — fs lives here only
71
/**
 * Node.js wrapper that builds an index and writes it to disk as JSON.
 *
 * - `type === "doc"`: every entry of `corpus` is read as a UTF-8 file path
 *   (an unreadable file is logged and indexed with empty content) and the
 *   result of `indexDocuments` is written to
 *   `outputIndexFilePath + '/docIndexFile.json'`.
 * - otherwise: `corpus` is treated as the query text and the result of
 *   `indexQuery` is written to `outputIndexFilePath + '/queryIndexFile.json'`.
 *
 * Read, index and write failures are reported with `console.log` and are not
 * rethrown.
 *
 * @param corpus File paths (doc mode) or the query (query mode).
 * @param outputIndexFilePath Directory the JSON file is written into.
 * @param type `"doc"` or `"query"`.
 * @param pack Language pack forwarded to the indexing functions.
 */
export async function indexTerms(corpus, outputIndexFilePath, type, pack) {
    // `fs` is imported lazily so that only this wrapper depends on it.
    const fs = await import("fs");
    if (type === "doc") {
        const docs = corpus.map((filePath) => {
            try {
                return { id: filePath, content: fs.readFileSync(filePath, 'utf8') };
            }
            catch (error) {
                console.log(`Error reading ${filePath} file from disk:`, error);
                return { id: filePath, content: "" };
            }
        });
        const result = indexDocuments(docs, pack);
        try {
            fs.writeFileSync(outputIndexFilePath + '/docIndexFile.json', JSON.stringify(result, null, 2));
            for (const doc of docs) {
                console.log(`Contents of ${doc.id} successfully added to index`);
            }
        }
        catch (error) {
            console.log("Index creation failed", error);
        }
        return;
    }
    try {
        // `indexQuery` needs a single string. The declared parameter type is
        // string[], and handing the array over made `query.split` throw.
        // NOTE(review): joining the parts with one space is an assumption about
        // what a multi-element query means; callers passing a plain string
        // are unaffected.
        const query = Array.isArray(corpus) ? corpus.join(" ") : corpus;
        const result = indexQuery(query, pack);
        fs.writeFileSync(outputIndexFilePath + '/queryIndexFile.json', JSON.stringify(result, null, 2));
        console.log(`Contents of Query successfully added to index`);
    }
    catch (error) {
        console.log("Index creation failed", error);
    }
}
106
+ export default indexTerms;
107
+ //# sourceMappingURL=indexer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"indexer.js","sourceRoot":"","sources":["../src/indexer.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,cAAc,CAAA;AAC/B,OAAO,UAAU,MAAM,uBAAuB,CAAA;AAC9C,OAAO,UAAU,MAAM,uBAAuB,CAAA;AAgB9C,oCAAoC;AACpC,MAAM,UAAU,cAAc,CAC5B,IAA4C,EAC5C,IAAkB;IAElB,MAAM,SAAS,GAAiB;QAC9B,WAAW,EAAE,IAAI,CAAC,MAAM;QACxB,iBAAiB,EAAE,EAAE;QACrB,KAAK,EAAE,EAAE;KACV,CAAA;IAED,IAAI,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE;QACnB,SAAS,CAAC,iBAAiB,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAA;QAEnE,aAAa;QACb,MAAM,cAAc,GAAG,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC,OAAO,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;QACjF,MAAM,YAAY,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;QACnE,MAAM,MAAM,GAAG,YAAY;aACxB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;aAChB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACZ,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,CAAA;QACrB,CAAC,CAAC,CAAA;QAEJ,QAAQ;QACR,IAAI,QAAQ,GAAG,CAAC,CAAA;QAChB,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;YACtB,IAAI,IAAI,IAAI,SAAS,CAAC,KAAK,EAAE,CAAC;gBAC5B,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;oBACxC,IAAI,GAAG,CAAC,EAAE,IAAI,OAAO,EAAE,CAAC;wBACtB,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAA;wBACjB,QAAQ,GAAG,CAAC,CAAA;oBACd,CAAC;gBACH,CAAC,CAAC,CAAA;gBACF,IAAI,QAAQ,KAAK,CAAC,EAAE,CAAC;oBACnB,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAA;gBAC7C,CAAC;qBAAM,CAAC;oBACN,QAAQ,GAAG,CAAC,CAAA;gBACd,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAA;YAC3C,CAAC;QACH,CAAC,CAAC,CAAA;IACJ,CAAC,CAAC,CAAA;IAEF,OAAO,SAAS,CAAA;AAClB,CAAC;AAED,MAAM,UAAU,UAAU,CACxB,KAAa,EACb,IAAkB;IAElB,MAAM,SAAS,GAAmB;QAChC,WAAW,EAAE,CAAC;QACd,iBAAiB,EAAE,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM;QAC1C,KAAK,EAAE,EAAE;KACV,CAAA;IAED,aAAa;IACb,MAAM,cAAc,GAAG,UAAU,CAAC,UAAU,CAAC,KAAK,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;IAC3E,MAAM,YAAY,GAAG,
cAAc,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IACnE,MAAM,MAAM,GAAG,YAAY;SACxB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;SAChB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;QACZ,OAAO,CAAC,CAAC,MAAM,GAAG,CAAC,CAAA;IACrB,CAAC,CAAC,CAAA;IAEJ,QAAQ;IACR,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;QACtB,IAAI,IAAI,IAAI,SAAS,CAAC,KAAK,EAAE,CAAC;YAC5B,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAA;QACzB,CAAC;aAAM,CAAC;YACN,SAAS,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QAC3B,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,OAAO,SAAS,CAAA;AAClB,CAAC;AAED,wDAAwD;AACxD,MAAM,CAAC,KAAK,UAAU,UAAU,CAC9B,MAAgB,EAChB,mBAA2B,EAC3B,IAAqB,EACrB,IAAkB;IAElB,MAAM,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAA;IAC7B,IAAI,IAAI,KAAK,KAAK,EAAE,CAAC;QACnB,MAAM,IAAI,GAAG,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE;YACjC,IAAI,CAAC;gBACH,MAAM,OAAO,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAA;gBACjD,OAAO,EAAE,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAA;YAClC,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,OAAO,CAAC,GAAG,CAAC,iBAAiB,QAAQ,kBAAkB,EAAE,KAAK,CAAC,CAAA;gBAC/D,OAAO,EAAE,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,EAAE,EAAE,CAAA;YACtC,CAAC;QACH,CAAC,CAAC,CAAA;QACF,MAAM,MAAM,GAAG,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,CAAA;QACzC,IAAI,CAAC;YACH,EAAE,CAAC,aAAa,CAAC,mBAAmB,GAAG,oBAAoB,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAA;YAC7F,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;gBACjB,OAAO,CAAC,GAAG,CAAC,eAAe,GAAG,CAAC,EAAE,8BAA8B,CAAC,CAAA;YAClE,CAAC,CAAC,CAAA;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,GAAG,CAAC,uBAAuB,EAAE,KAAK,CAAC,CAAA;QAC7C,CAAC;IACH,CAAC;SAAM,CAAC;QACN,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,UAAU,CAAC,MAA2B,EAAE,IAAI,CAAC,CAAA;YAC5D,EAAE,CAAC,aAAa,CAAC,mBAAmB,GAAG,sBAAsB,EAAE,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAA;YAC/F,OAAO,CAAC,GAAG,CAAC,+CAA+C,CAAC,CAAA;QAC9D,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,GAAG,CAAC,uBAAuB,EAAE,KAAK,CAAC,CAAA;QAC7C,CAAC;IACH,CAAC;AACH,CAAC;AAED,eAAe,UAAU,CAAA"}
@@ -0,0 +1,9 @@
1
+ import type { LanguagePack } from './types.js';
2
+ /**
3
+ * Separates words, expands abbreviations, removes numbers, breaks up hyphenated words, and removes punctuation
4
+ * @param corpus : Amharic text
5
+ * @param pack : language pack configuration
6
+ * @returns : Lexically analyzed Amharic text
7
+ */
8
+ export declare function lexAnalyze(corpus: string, pack: LanguagePack): string;
9
+ export default lexAnalyze;
@@ -0,0 +1,22 @@
1
/**
 * Prepares raw text for word-level processing.
 *
 * Steps, in order:
 * 1. Every key of `pack.tokenization.exceptions` (abbreviations) found in the
 *    text is replaced by its expansion words joined with single spaces.
 * 2. The listed ASCII and Ethiopic punctuation marks are replaced by a space.
 * 3. ASCII digits and the listed Ethiopic numerals are replaced by a space.
 * 4. Runs of two or more whitespace characters collapse to one space.
 *
 * @param corpus : Amharic text
 * @param pack : language pack configuration
 * @returns : Lexically analyzed Amharic text
 */
export function lexAnalyze(corpus, pack) {
    let text = corpus;
    const exceptions = pack.tokenization && pack.tokenization.exceptions;
    if (exceptions) {
        // Longest keys first: otherwise a short abbreviation that is a substring
        // of a longer one is expanded inside it and the longer one never matches.
        const keys = Object.keys(exceptions)
            .filter((key) => key.length > 0)
            .sort((a, b) => b.length - a.length);
        for (const key of keys) {
            const expansion = exceptions[key].join(" ");
            // split/join replaces every occurrence literally, so `$` sequences
            // in the expansion are not read as replacement patterns.
            text = text.split(key).join(expansion);
        }
    }
    return text
        .replace(/[.\?"',/#!$%^&*;:፤።{}=\-_`~()]/g, " ")
        .replace(/[.፩፪፫፬፭፮፯፰፱፲፳፴፵፶፷፸፹፺፻0123456789]/g, " ")
        .replace(/\s{2,}/g, " ");
}
21
+ export default lexAnalyze;
22
+ //# sourceMappingURL=lexical_analyzer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"lexical_analyzer.js","sourceRoot":"","sources":["../src/lexical_analyzer.ts"],"names":[],"mappings":"AAGA;;;;;GAKG;AACH,MAAM,UAAU,UAAU,CAAC,MAAc,EAAE,IAAkB;IAC3D,oCAAoC;IACpC,IAAI,IAAI,CAAC,YAAY,IAAI,IAAI,CAAC,YAAY,CAAC,UAAU,EAAE,CAAC;QACtD,KAAK,MAAM,GAAG,IAAI,IAAI,CAAC,YAAY,CAAC,UAAU,EAAE,CAAC;YAC/C,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;YAC7D,MAAM,GAAG,MAAM,CAAC,UAAU,CAAC,GAAG,EAAE,SAAS,CAAC,CAAA;QAC5C,CAAC;IACH,CAAC;IAED,MAAM,GAAG,MAAM;SACZ,OAAO,CAAC,iCAAiC,EAAE,GAAG,CAAC;SAC/C,OAAO,CAAC,mCAAmC,EAAE,GAAG,CAAC;SACjD,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC,CAAA;IAE1B,OAAO,MAAM,CAAA;AACf,CAAC;AAED,eAAe,UAAU,CAAA"}
@@ -0,0 +1,11 @@
1
+ import type { LanguagePack } from './types.js';
2
+ /**
3
+ * Normalizes Amharic text by applying character mapping,
4
+ * labialized sequence normalization, and gemination collapse.
5
+ *
6
+ * @param text The input string to normalize.
7
+ * @param pack The language pack containing normalization configuration.
8
+ * @returns The normalized string.
9
+ */
10
+ export declare function normalize(text: string, pack: LanguagePack): string;
11
+ export default normalize;
@@ -0,0 +1,41 @@
1
/**
 * Normalizes text using the `normalization` section of a language pack.
 *
 * Steps, in order:
 * 1. Each character with an entry in `char_map` is replaced by that entry.
 * 2. Each character of the result with an entry in `labialized_map` is
 *    replaced by that entry.
 * 3. If `gemination_threshold` is a positive integer, any run of more than
 *    that many identical non-whitespace characters is cut down to exactly
 *    that many.
 *
 * Characters are handled per Unicode code point, so map entries and runs for
 * characters outside the Basic Multilingual Plane are matched as well.
 *
 * @param text The input string to normalize.
 * @param pack The language pack containing normalization configuration.
 * @returns The normalized string, or `text` unchanged when the pack has no
 *   `normalization` section.
 */
export function normalize(text, pack) {
    const config = pack.normalization;
    if (!config) {
        return text;
    }
    // Replaces every code point that has an entry in `table`; others are kept.
    const substitute = (input, table) => Array.from(input, (ch) => (table[ch] !== undefined ? table[ch] : ch)).join("");
    let normalized = substitute(text, config.char_map || {});
    normalized = substitute(normalized, config.labialized_map || {});
    const threshold = Number(config.gemination_threshold);
    // Only integers give a valid `{n,}` quantifier.
    if (Number.isInteger(threshold) && threshold > 0) {
        const runs = new RegExp(`(\\S)\\1{${threshold},}`, 'gu');
        normalized = normalized.replace(runs, (_run, ch) => ch.repeat(threshold));
    }
    return normalized;
}
40
+ export default normalize;
41
+ //# sourceMappingURL=normalizer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"normalizer.js","sourceRoot":"","sources":["../src/normalizer.ts"],"names":[],"mappings":"AAEA;;;;;;;GAOG;AACH,MAAM,UAAU,SAAS,CAAC,IAAY,EAAE,IAAkB;IACxD,IAAI,CAAC,IAAI,CAAC,aAAa,EAAE,CAAC;QACxB,OAAO,IAAI,CAAA;IACb,CAAC;IAED,IAAI,UAAU,GAAG,IAAI,CAAA;IAErB,oBAAoB;IACpB,MAAM,OAAO,GAAG,IAAI,CAAC,aAAa,CAAC,QAAQ,IAAI,EAAE,CAAA;IACjD,IAAI,KAAK,GAAG,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC,CAAA;IAChC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,IAAI,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,SAAS,EAAE,CAAC;YACpC,KAAK,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAA;QAC9B,CAAC;IACH,CAAC;IACD,UAAU,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IAE3B,0BAA0B;IAC1B,MAAM,aAAa,GAAG,IAAI,CAAC,aAAa,CAAC,cAAc,IAAI,EAAE,CAAA;IAC7D,IAAI,MAAM,GAAG,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC,CAAA;IACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,IAAI,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,KAAK,SAAS,EAAE,CAAC;YAC3C,MAAM,CAAC,CAAC,CAAC,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAA;QACtC,CAAC;IACH,CAAC;IACD,UAAU,GAAG,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IAE5B,yBAAyB;IACzB,MAAM,SAAS,GAAG,IAAI,CAAC,aAAa,CAAC,oBAAoB,CAAA;IACzD,IAAI,SAAS,KAAK,SAAS,IAAI,SAAS,GAAG,CAAC,EAAE,CAAC;QAC7C,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,eAAe,SAAS,IAAI,EAAE,GAAG,CAAC,CAAA;QAC3D,UAAU,GAAG,UAAU,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,KAAK,EAAE,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAA;IAC7E,CAAC;IAED,OAAO,UAAU,CAAA;AACnB,CAAC;AAED,eAAe,SAAS,CAAA"}
@@ -0,0 +1,19 @@
1
+ import type { LanguagePack } from './types.js';
2
+ import type { DocIndexData, QueryIndexData } from './indexer.js';
3
+ export declare class Pipeline {
4
+ private pack;
5
+ constructor(pack: LanguagePack);
6
+ normalize(text: string): string;
7
+ sentenceTokenize(text: string): string[];
8
+ stem(word: string): string;
9
+ removeStopwords(corpus: string): string;
10
+ lexAnalyze(corpus: string): string;
11
+ feligTransliterate(word: string, lang: "am" | "en"): string;
12
+ seraTransliterate(word: string, lang: "am" | "en"): string;
13
+ indexDocuments(docs: Array<{
14
+ id: string;
15
+ content: string;
16
+ }>): DocIndexData;
17
+ indexQuery(query: string): QueryIndexData;
18
+ weighTerms(index: DocIndexData | QueryIndexData, type: "doc" | "query"): Record<string, any>;
19
+ }
@@ -0,0 +1,44 @@
1
+ import { stem } from './stemmer.js';
2
+ import { removeStopwords } from './stopword_remover.js';
3
+ import { lexAnalyze } from './lexical_analyzer.js';
4
+ import { felig_transliterate, sera_transliterate } from './transliterator.js';
5
+ import { indexDocuments, indexQuery } from './indexer.js';
6
+ import { weighTerms } from './term_weighter.js';
7
+ import { normalize } from './normalizer.js';
8
+ import { sentenceTokenize } from './sentence_tokenizer.js';
9
/**
 * Facade that binds one language pack to the package's processing functions,
 * so callers do not have to pass the pack on every call. Each method forwards
 * its arguments, plus the stored pack where the target takes one, to the
 * function of the matching name.
 */
export class Pipeline {
    /** @param pack Language pack handed to the delegated calls. */
    constructor(pack) {
        this.pack = pack;
    }

    /** Delegates to `normalize(text, pack)`. */
    normalize(text) {
        const { pack } = this;
        return normalize(text, pack);
    }

    /** Delegates to `sentenceTokenize(text, pack)`. */
    sentenceTokenize(text) {
        const { pack } = this;
        return sentenceTokenize(text, pack);
    }

    /** Delegates to `stem(word, pack)`. */
    stem(word) {
        const { pack } = this;
        return stem(word, pack);
    }

    /** Delegates to `removeStopwords(corpus, pack)`. */
    removeStopwords(corpus) {
        const { pack } = this;
        return removeStopwords(corpus, pack);
    }

    /** Delegates to `lexAnalyze(corpus, pack)`. */
    lexAnalyze(corpus) {
        const { pack } = this;
        return lexAnalyze(corpus, pack);
    }

    /** Delegates to `felig_transliterate(word, lang, pack)`. */
    feligTransliterate(word, lang) {
        const { pack } = this;
        return felig_transliterate(word, lang, pack);
    }

    /** Delegates to `sera_transliterate(word, lang, pack)`. */
    seraTransliterate(word, lang) {
        const { pack } = this;
        return sera_transliterate(word, lang, pack);
    }

    /** Delegates to `indexDocuments(docs, pack)`. */
    indexDocuments(docs) {
        const { pack } = this;
        return indexDocuments(docs, pack);
    }

    /** Delegates to `indexQuery(query, pack)`. */
    indexQuery(query) {
        const { pack } = this;
        return indexQuery(query, pack);
    }

    /** Delegates to `weighTerms(index, type)`; the pack is not passed. */
    weighTerms(index, type) {
        return weighTerms(index, type);
    }
}
44
+ //# sourceMappingURL=pipeline.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pipeline.js","sourceRoot":"","sources":["../src/pipeline.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,IAAI,EAAE,MAAM,cAAc,CAAA;AACnC,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAA;AACvD,OAAO,EAAE,UAAU,EAAE,MAAM,uBAAuB,CAAA;AAClD,OAAO,EAAE,mBAAmB,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAA;AAC7E,OAAO,EAAE,cAAc,EAAE,UAAU,EAAE,MAAM,cAAc,CAAA;AACzD,OAAO,EAAE,UAAU,EAAE,MAAM,oBAAoB,CAAA;AAE/C,OAAO,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAA;AAC3C,OAAO,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAA;AAE1D,MAAM,OAAO,QAAQ;IACnB,YAAoB,IAAkB;QAAlB,SAAI,GAAJ,IAAI,CAAc;IAAG,CAAC;IAE1C,SAAS,CAAC,IAAY;QACpB,OAAO,SAAS,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IACnC,CAAC;IAED,gBAAgB,CAAC,IAAY;QAC3B,OAAO,gBAAgB,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IAC1C,CAAC;IAED,IAAI,CAAC,IAAY;QACf,OAAO,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IAC9B,CAAC;IACD,eAAe,CAAC,MAAc;QAC5B,OAAO,eAAe,CAAC,MAAM,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IAC3C,CAAC;IACD,UAAU,CAAC,MAAc;QACvB,OAAO,UAAU,CAAC,MAAM,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IACtC,CAAC;IACD,kBAAkB,CAAC,IAAY,EAAE,IAAiB;QAChD,OAAO,mBAAmB,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IACnD,CAAC;IACD,iBAAiB,CAAC,IAAY,EAAE,IAAiB;QAC/C,OAAO,kBAAkB,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IAClD,CAAC;IACD,cAAc,CAAC,IAA4C;QACzD,OAAO,cAAc,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IACxC,CAAC;IACD,UAAU,CAAC,KAAa;QACtB,OAAO,UAAU,CAAC,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,CAAA;IACrC,CAAC;IACD,UAAU,CAAC,KAAoC,EAAE,IAAqB;QACpE,OAAO,UAAU,CAAC,KAAK,EAAE,IAAI,CAAC,CAAA;IAChC,CAAC;CACF"}
@@ -0,0 +1,10 @@
1
+ import type { LanguagePack } from './types.js';
2
+ /**
3
+ * Tokenizes Amharic text into sentences using configured sentence boundaries.
4
+ *
5
+ * @param text The input string to split into sentences.
6
+ * @param pack The language pack containing tokenization configuration.
7
+ * @returns An array of sentence strings.
8
+ */
9
+ export declare function sentenceTokenize(text: string, pack: LanguagePack): string[];
10
+ export default sentenceTokenize;
@@ -0,0 +1,22 @@
1
/**
 * Splits text into sentences at the configured sentence-boundary characters.
 *
 * The boundaries come from `pack.tokenization.sentence_boundaries`, falling
 * back to `["።", "፡", "?", "!", "."]`. They are combined into one regular
 * expression character class, so a run of consecutive boundary characters
 * counts as a single split point, and a multi-character boundary string
 * contributes each of its characters individually.
 *
 * @param text The input string to split into sentences.
 * @param pack The language pack containing tokenization configuration.
 * @returns The trimmed, non-empty sentences; `[text]` when the configured
 *   boundary list is empty.
 */
export function sentenceTokenize(text, pack) {
    const boundaries = pack.tokenization?.sentence_boundaries || ["።", "፡", "?", "!", "."];
    if (boundaries.length === 0) {
        return [text];
    }
    // Escape for use inside [...]. `-` is included because, between two other
    // boundary characters, it would otherwise form a range (or an invalid,
    // out-of-order one that makes the RegExp constructor throw).
    const charClass = boundaries
        .map((boundary) => boundary.replace(/[-.*+?^${}()|[\]\\]/g, '\\$&'))
        .join('');
    const splitter = new RegExp(`[${charClass}]+`, 'g');
    return text
        .split(splitter)
        .map((sentence) => sentence.trim())
        .filter((sentence) => sentence.length > 0);
}
21
+ export default sentenceTokenize;
22
+ //# sourceMappingURL=sentence_tokenizer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"sentence_tokenizer.js","sourceRoot":"","sources":["../src/sentence_tokenizer.ts"],"names":[],"mappings":"AAEA;;;;;;GAMG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAY,EAAE,IAAkB;IAC/D,MAAM,UAAU,GAAG,IAAI,CAAC,YAAY,EAAE,mBAAmB,IAAI,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAA;IACtF,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC5B,OAAO,CAAC,IAAI,CAAC,CAAA;IACf,CAAC;IAED,8CAA8C;IAC9C,MAAM,OAAO,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IACtF,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,IAAI,OAAO,IAAI,EAAE,GAAG,CAAC,CAAA;IAE9C,mEAAmE;IACnE,OAAO,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC;SACrB,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;SAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AAC9B,CAAC;AAED,eAAe,gBAAgB,CAAA"}
@@ -0,0 +1,12 @@
1
+ import type { LanguagePack } from "./types.js";
2
+ /**
3
+ * Takes an Amharic word and returns the stem through affix-removal with longest match.
4
+ * @param word : word possibly containing one or more affix
5
+ * @param pack : the language pack configuration
6
+ * @returns : the stem of the word passed
7
+ *
8
+ * @example {stem word with affix}
9
+ * stem("ልጆቻቸውን", amPack) // returns "ልጅ"
10
+ */
11
+ export declare function stem(word: string, pack: LanguagePack): string;
12
+ export default stem;
@@ -0,0 +1,86 @@
1
+ // Takes Amharic language words and produces a stem
2
+ // ልጆች -> ልጅኦች -> ljoc -> lj -> ልጅ
3
+ import { felig_transliterate } from "./transliterator.js";
4
/**
 * Escapes regular-expression metacharacters in `str` by prefixing each with a
 * backslash, so the result can be embedded in a RegExp source as literal text.
 */
function escapeRegExp(str) {
    const metaCharacters = /[.*+?^${}()|[\]\\]/g;
    const escaped = str.replace(metaCharacters, '\\$&');
    return escaped;
}
7
/**
 * Reduces a word to a stem by stripping one suffix and one prefix (longest
 * candidate first) and then applying infix rules.
 *
 * A word listed in `pack.stemmer.protected_words` is returned untouched, and a
 * word that becomes a protected word once a configured prefix is cut off is
 * returned as that protected word. Everything else is transliterated with
 * `felig_transliterate(..., "am", pack)`, reduced in that form, and converted
 * back with `felig_transliterate(..., "en", pack)`.
 *
 * @param word : word possibly containing one or more affix
 * @param pack : the language pack configuration
 * @returns : the stem of the word passed
 *
 * @example {stem word with affix}
 * stem("ልጆቻቸውን", amPack) // returns "ልጅ"
 */
export function stem(word, pack) {
    const config = pack.stemmer;
    const protectedWords = config.protected_words;
    if (protectedWords) {
        if (protectedWords.includes(word)) {
            return word;
        }
        // A protected word hidden behind a single prefix is returned bare.
        for (const prefix of config.prefixes || []) {
            if (!word.startsWith(prefix)) {
                continue;
            }
            const remainder = word.substring(prefix.length);
            if (protectedWords.includes(remainder)) {
                return remainder;
            }
        }
    }

    // Working form of the word (consonant-vowel string).
    let cv_string = felig_transliterate(word, "am", pack);

    // Candidate suffixes in transliterated form, longest first.
    const suffixForms = [];
    for (const suffix of config.suffixes || []) {
        suffixForms.push(felig_transliterate(suffix, "am", pack));
        if (suffix.startsWith("ዎ")) {
            // Also try the variant whose first character is ኦ instead of ዎ.
            suffixForms.push(felig_transliterate("ኦ" + suffix.substring(1), "am", pack));
        }
    }
    suffixForms.push("Wa"); // Special case for ሯ
    suffixForms.sort((a, b) => b.length - a.length);

    // Candidate prefixes in transliterated form, longest first.
    const prefixForms = [];
    for (const prefix of config.prefixes || []) {
        prefixForms.push(felig_transliterate(prefix, "am", pack));
    }
    prefixForms.sort((a, b) => b.length - a.length);

    // Strip only the first (longest) suffix that matches.
    const matchedSuffix = suffixForms.find((sfx) => cv_string.endsWith(sfx));
    if (matchedSuffix !== undefined) {
        const suffixPattern = new RegExp(`${escapeRegExp(matchedSuffix)}$`, `i`);
        cv_string = cv_string.replace(suffixPattern, "");
    }

    // Strip only the first (longest) prefix that matches what is left.
    const matchedPrefix = prefixForms.find((pfx) => cv_string.startsWith(pfx));
    if (matchedPrefix !== undefined) {
        const prefixPattern = new RegExp(`^${escapeRegExp(matchedPrefix)}`);
        cv_string = cv_string.replace(prefixPattern, "");
    }

    // Infix rules: at most one of the two applies.
    const hasRepeatedConsonant = /.+([^aeiou])[aeiou]\1[aeiou].?/i.test(cv_string);
    if (hasRepeatedConsonant) {
        cv_string = cv_string.replace(/\S\S[^aeiou][aeiou]/i, cv_string[0] + cv_string[1]);
    }
    else if (/^(.+)a\1$/i.test(cv_string)) {
        cv_string = cv_string.replace(/a.+/i, "");
    }

    // Put an "X" after the first consonant of the first two-consonant + "e" cluster.
    const clusterPattern = /[bcdfghjklmnpqrstvwxyz]{2}e/i;
    const clusterMatch = cv_string.match(clusterPattern);
    if (clusterMatch) {
        const cluster = clusterMatch[0];
        cv_string = cv_string.replace(clusterPattern, cluster.substring(0, 1) + "X" + cluster.substring(1));
    }

    return felig_transliterate(cv_string, "en", pack);
}
85
+ export default stem;
86
+ //# sourceMappingURL=stemmer.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"stemmer.js","sourceRoot":"","sources":["../src/stemmer.ts"],"names":[],"mappings":"AAAA,mDAAmD;AACnD,kCAAkC;AAClC,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAA;AAGzD,SAAS,YAAY,CAAC,GAAW;IAC/B,OAAO,GAAG,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAA;AACnD,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,IAAI,CAAC,IAAY,EAAE,IAAkB;IACnD,IAAI,IAAI,CAAC,OAAO,CAAC,eAAe,EAAE,CAAC;QACjC,IAAI,IAAI,CAAC,OAAO,CAAC,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;YAChD,OAAO,IAAI,CAAA;QACb,CAAC;QACD,wDAAwD;QACxD,MAAM,QAAQ,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,EAAE,CAAA;QAC5C,KAAK,MAAM,MAAM,IAAI,QAAQ,EAAE,CAAC;YAC9B,IAAI,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;gBAC5B,MAAM,QAAQ,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;gBAC9C,IAAI,IAAI,CAAC,OAAO,CAAC,eAAe,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,CAAC;oBACpD,OAAO,QAAQ,CAAA;gBACjB,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAED,IAAI,SAAS,GAAG,mBAAmB,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,CAAC,CAAA,CAAC,yBAAyB;IAE/E,MAAM,OAAO,GAAa,EAAE,CAAA;IAC5B,MAAM,OAAO,GAAa,EAAE,CAAA;IAE5B,uBAAuB;IACvB,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,EAAE,CAAA;IACxC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;QACtB,OAAO,CAAC,IAAI,CAAC,mBAAmB,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;QACrD,IAAI,MAAM,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YAC3B,MAAM,SAAS,GAAG,GAAG,GAAG,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,CAAA;YAC3C,OAAO,CAAC,IAAI,CAAC,mBAAmB,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;QAC1D,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA,CAAC,qBAAqB;IACxC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,CAAC,CAAA;IAE3C,uBAAuB;IACvB,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,QAAQ,IAAI,EAAE,CAAA;IACxC,IAAI,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;QACtB,OAAO,CAAC,IAAI,CAAC,mBAAmB,CAAC,MAAM,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IACF,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,CAAC,CAAA;IAE3C,kBAAkB;IAClB,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,EAAE,KAAK;QAChC,IAAI,SAAS,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAA
C;YAC5B,IAAI,KAAK,GAAG,IAAI,MAAM,CAAC,GAAG,YAAY,CAAC,GAAG,CAAC,GAAG,EAAE,GAAG,CAAC,CAAA;YACpD,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAA;YACxC,OAAO,KAAK,CAAA;QACd,CAAC;;YAAM,OAAO,IAAI,CAAA;IACpB,CAAC,CAAC,CAAA;IAEF,kBAAkB;IAClB,OAAO,CAAC,KAAK,CAAC,UAAU,GAAG,EAAE,KAAK;QAChC,IAAI,SAAS,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;YAC9B,IAAI,KAAK,GAAG,IAAI,MAAM,CAAC,IAAI,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAA;YAC/C,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAA;YACxC,OAAO,KAAK,CAAA;QACd,CAAC;;YAAM,OAAO,IAAI,CAAA;IACpB,CAAC,CAAC,CAAA;IAEF,iBAAiB;IACjB,IAAI,iCAAiC,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QACtD,SAAS,GAAG,SAAS,CAAC,OAAO,CAC3B,sBAAsB,EACtB,SAAS,CAAC,CAAC,CAAC,GAAG,SAAS,CAAC,CAAC,CAAC,CAC5B,CAAA;IACH,CAAC;SAAM,IAAI,YAAY,CAAC,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC;QACxC,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,CAAA;IAC3C,CAAC;IAED,MAAM,QAAQ,GAAG,SAAS,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAA;IAChE,IAAI,QAAQ,EAAE,CAAC;QACb,SAAS,GAAG,SAAS,CAAC,OAAO,CAC3B,8BAA8B,EAC9B,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,GAAG,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAC7D,CAAA;IACH,CAAC;IAED,OAAO,mBAAmB,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,CAAA;AACnD,CAAC;AAED,eAAe,IAAI,CAAA"}
@@ -0,0 +1,12 @@
1
+ import type { LanguagePack } from './types.js';
2
+ /**
3
+ * Removes commonly occurring words that have no contribution to the semantics of the corpus.
4
+ * @param corpus : Amharic text
5
+ * @param pack : language pack configuration
6
+ * @returns : the corpus without stopwords listed on {@link LanguagePack.stopwords}
7
+ *
8
+ * @example {remove stopwords}
9
+ * removeStopwords("ይህ ሞባይል እና ኮምፒዩተር", amPack) // returns "ሞባይል ኮምፒዩተር"
10
+ */
11
+ export declare function removeStopwords(corpus: string, pack: LanguagePack): string;
12
+ export default removeStopwords;
@@ -0,0 +1,33 @@
1
/**
 * Escapes every regular-expression metacharacter in `string` so that the
 * result can be embedded in a `RegExp` source and match the input literally.
 *
 * @param string : text that may contain regex metacharacters
 * @returns : the same text with each metacharacter prefixed by a backslash
 */
function escapeRegExp(string) {
  const metacharacters = /[.*+?^${}()|[\]\\]/g;
  // Prefix each matched metacharacter with a single backslash.
  return string.replace(metacharacters, (metachar) => `\\${metachar}`);
}
4
/**
 * Strips the stopwords listed on {@link LanguagePack.stopwords} from a corpus.
 *
 * Stopwords are processed longest first. Each one is matched only as a whole
 * word: it must be preceded by the start of the string or a character outside
 * U+1200–U+137F, and followed by such a character or the end of the string.
 * One of the prefixes የ, በ, ከ, ለ, ስለ, የሚ, የማ and one of the suffixes ም, ን may be
 * attached to the stopword; they are kept while the stopword itself is dropped.
 *
 * The boundary character in front of a removed stopword is kept and whitespace
 * is not collapsed, so the spaces that surrounded a removed word remain.
 *
 * @param corpus : Amharic text
 * @param pack : language pack configuration
 * @returns : the corpus with every matched stopword removed
 *
 * @example {remove stopwords}
 * // with a pack whose stopwords include "ይህ" and "እና":
 * removeStopwords("ይህ ሞባይል እና ኮምፒዩተር", amPack) // leaves "ሞባይል" and "ኮምፒዩተር" plus the original spaces
 */
export function removeStopwords(corpus, pack) {
  // Work on a copy so the pack's own list is never reordered.
  const longestFirst = [...pack.stopwords];
  longestFirst.sort((left, right) => right.length - left.length);

  // Replacement: boundary + optional prefix + optional suffix (stopword dropped).
  const dropStopword = (_whole, boundary, prefix, _stopword, suffix) =>
    boundary + (prefix || '') + (suffix || '');

  let cleaned = corpus;
  for (const stopword of longestFirst) {
    // Group 1: leading boundary (non-Ge'ez char or start of string)
    // Group 2: optional prefix
    // Group 3: the stopword itself
    // Group 4: optional suffix
    // Lookahead: trailing boundary (non-Ge'ez char or end of string)
    const pattern = new RegExp(`(^|[^\\u1200-\\u137F])(የ|በ|ከ|ለ|ስለ|የሚ|የማ)?(${escapeRegExp(stopword)})(ም|ን)?(?=[^\\u1200-\\u137F]|$)`, 'g');
    cleaned = cleaned.replace(pattern, dropStopword);
  }
  return cleaned;
}
32
+ export default removeStopwords;
33
+ //# sourceMappingURL=stopword_remover.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"stopword_remover.js","sourceRoot":"","sources":["../src/stopword_remover.ts"],"names":[],"mappings":"AAGA,SAAS,YAAY,CAAC,MAAc;IAClC,OAAO,MAAM,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,CAAA;AACtD,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,eAAe,CAAC,MAAc,EAAE,IAAkB;IAChE,IAAI,MAAM,GAAG,MAAM,CAAA;IACnB,kEAAkE;IAClE,MAAM,eAAe,GAAG,CAAC,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,MAAM,CAAC,CAAA;IAE/E,eAAe,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;QAC/B,sBAAsB;QACtB,gEAAgE;QAChE,mDAAmD;QACnD,+BAA+B;QAC/B,8CAA8C;QAC9C,gFAAgF;QAChF,MAAM,KAAK,GAAG,IAAI,MAAM,CACtB,6CAA6C,YAAY,CAAC,IAAI,CAAC,iCAAiC,EAChG,GAAG,CACJ,CAAA;QACD,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC,KAAK,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE,EAAE;YACvD,gEAAgE;YAChE,OAAO,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,GAAG,EAAE,IAAI,EAAE,EAAE,CAAA;QACtC,CAAC,CAAC,CAAA;IACJ,CAAC,CAAC,CAAA;IAEF,OAAO,MAAM,CAAA;AACf,CAAC;AAED,eAAe,eAAe,CAAA"}
@@ -0,0 +1,4 @@
1
+ import type { DocIndexData, QueryIndexData } from "./indexer.js";
2
+ export declare function weighTerms(index: DocIndexData | QueryIndexData, type: "doc" | "query"): Record<string, any>;
3
+ export declare function weigh_terms(indexFilePath: string, outputWeightedTermsPath: string, typeOfIndex: "doc" | "query"): Promise<void>;
4
+ export default weigh_terms;
@@ -0,0 +1,71 @@
1
/**
 * Computes term weights from an in-memory index. Pure function — no fs.
 *
 * For `type === "doc"` the index is read as
 * `{ corpus_size, corpus_word_count: { [file]: count }, words: { [word]: [{ [file]: freq }, ...] } }`
 * and each word maps to an array of `{ [file]: tf * idf }` objects, where
 * `idf = log2(corpus_size / postings.length)` and
 * `tf = freq / corpus_word_count[file]`. Only the first key/value pair of each
 * posting object is used. Words with an empty postings array are omitted.
 *
 * For `type === "query"` the index is read as
 * `{ corpus_word_count, words: { [word]: freq } }` and each word maps to
 * `freq / corpus_word_count` (idf is fixed at 1).
 *
 * Any other `type` yields an empty object.
 *
 * The result is assembled with `Object.fromEntries`, so terms whose spelling
 * matches an inherited `Object.prototype` member ("constructor", "toString",
 * "__proto__", ...) are stored as ordinary own properties. A plain
 * `word in result` check treats those names as already present.
 *
 * @param index : document or query index data
 * @param type : "doc" or "query"
 * @returns : a plain object keyed by term
 */
export function weighTerms(index, type) {
  const entries = [];

  if (type === "doc") {
    const dataset = index;
    for (const word of Object.keys(dataset.words)) {
      const postings = dataset.words[word];
      // Inverse document frequency: rarer terms weigh more.
      const idf = Math.log2(dataset.corpus_size / postings.length);
      const weighted = [];
      postings.forEach((filePathObj) => {
        const file = Object.keys(filePathObj)[0];
        const freq = Object.values(filePathObj)[0];
        // Length-normalized term frequency within this file.
        const tf = freq / dataset.corpus_word_count[file];
        weighted.push({ [file]: idf * tf });
      });
      // A word without postings produces no entry at all.
      if (weighted.length > 0) {
        entries.push([word, weighted]);
      }
    }
  } else if (type === "query") {
    const dataset = index;
    const idf = 1;
    for (const word of Object.keys(dataset.words)) {
      const freq = dataset.words[word];
      const tf = freq / dataset.corpus_word_count;
      entries.push([word, idf * tf]);
    }
  }

  return Object.fromEntries(entries);
}
42
/**
 * Backwards-compat Node.js wrapper around `weighTerms`.
 *
 * Reads the JSON index at `indexFilePath`, weighs its terms and writes the
 * pretty-printed result to
 * `${outputWeightedTermsPath}/${typeOfIndex}WeightedTermsFile.json`.
 * Failures are reported with `console.log` and never rethrown, so the
 * returned promise resolves whether or not the file was written.
 *
 * @param indexFilePath : path of the index JSON file to read
 * @param outputWeightedTermsPath : directory that receives the output file
 * @param typeOfIndex : "doc" or "query"
 */
export async function weigh_terms(indexFilePath, outputWeightedTermsPath, typeOfIndex) {
  const fs = await import("fs");
  const weightedTermsPath = outputWeightedTermsPath + `/${typeOfIndex}WeightedTermsFile.json`;

  // Stage 1: read the index file.
  let jsonString;
  try {
    jsonString = fs.readFileSync(indexFilePath, "utf8");
  }
  catch (error) {
    console.log(`Error reading Index file ${indexFilePath} from disk:`, error);
    return;
  }

  // Stage 2: parse, weigh and serialize.
  let outJsonString;
  try {
    const dataset = JSON.parse(jsonString);
    const result = weighTerms(dataset, typeOfIndex);
    outJsonString = JSON.stringify(result, null, 2);
  }
  catch (err) {
    console.log("Error parsing JSON string:", err);
    return;
  }

  // Stage 3: write the weighted terms to disk.
  try {
    fs.writeFileSync(weightedTermsPath, outJsonString);
    console.log(`Indexed terms successfully weighted`);
  }
  catch (error) {
    console.log("Weighted terms creation failed", error);
  }
}
70
+ export default weigh_terms;
71
+ //# sourceMappingURL=term_weighter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"term_weighter.js","sourceRoot":"","sources":["../src/term_weighter.ts"],"names":[],"mappings":"AAEA,wBAAwB;AACxB,MAAM,UAAU,UAAU,CACtB,KAAoC,EACpC,IAAqB;IAErB,MAAM,aAAa,GAAwB,EAAE,CAAC;IAE9C,IAAI,IAAI,KAAK,KAAK,EAAE,CAAC;QACjB,MAAM,OAAO,GAAG,KAAqB,CAAC;QACtC,IAAI,EAAE,GAAG,CAAC,CAAC;QACX,IAAI,GAAG,GAAG,CAAC,CAAC;QACZ,IAAI,MAAM,GAAG,CAAC,CAAC;QAEf,4EAA4E;QAC5E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;YACxC,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC;YAElE,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,OAAO,CACvB,CAAC,WAAmC,EAAE,EAAE;gBACpC,IAAI,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;gBACvC,IAAI,IAAI,GAAG,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;gBAEzC,EAAE,GAAG,IAAI,GAAG,OAAO,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC;gBAC5C,MAAM,GAAG,GAAG,GAAG,EAAE,CAAC;gBAElB,+BAA+B;gBAC/B,IAAI,IAAI,IAAI,aAAa,EAAE,CAAC;oBACxB,aAAa,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;gBACjD,CAAC;qBAAM,CAAC;oBACJ,aAAa,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC;gBAC/C,CAAC;YACL,CAAC,CACJ,CAAC;QACN,CAAC,CAAC,CAAC;IACP,CAAC;SAAM,IAAI,IAAI,KAAK,OAAO,EAAE,CAAC;QAC1B,MAAM,OAAO,GAAG,KAAuB,CAAC;QACxC,IAAI,EAAE,GAAG,CAAC,CAAC;QACX,IAAI,GAAG,GAAG,CAAC,CAAC;QACZ,IAAI,MAAM,GAAG,CAAC,CAAC;QAEf,4EAA4E;QAC5E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE;YACxC,IAAI,IAAI,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YAE/B,EAAE,GAAG,IAAI,GAAG,OAAO,CAAC,iBAAiB,CAAC;YACtC,MAAM,GAAG,GAAG,GAAG,EAAE,CAAC;YAElB,aAAa,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC;QACjC,CAAC,CAAC,CAAC;IACP,CAAC;IAED,OAAO,aAAa,CAAC;AACzB,CAAC;AAED,mCAAmC;AACnC,MAAM,CAAC,KAAK,UAAU,WAAW,CAC7B,aAAqB,EACrB,uBAA+B,EAC/B,WAA4B;IAE5B,MAAM,EAAE,GAAG,MAAM,MAAM,CAAC,IAAI,CAAC,CAAC;IAC9B,MAAM,iBAAiB,GACnB,uBAAuB,GAAG,IAAI,WAAW,wBAAwB,CAAC;IAEtE,kBAAkB;IAClB,IAAI,CAAC;QACD,MAAM,UAAU,GAAG,EAAE,CAAC,YAAY,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC;QAC1D,IAAI,CAAC;YACD,MAAM,OAAO,GAAG,
IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC;YACvC,MAAM,MAAM,GAAG,UAAU,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;YAEhD,iBAAiB;YACjB,MAAM,aAAa,GAAG,IAAI,CAAC,SAAS,CAAC,MAAM,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;YAEtD,IAAI,CAAC;gBACD,EAAE,CAAC,aAAa,CAAC,iBAAiB,EAAE,aAAa,CAAC,CAAC;gBACnD,OAAO,CAAC,GAAG,CAAC,qCAAqC,CAAC,CAAC;YACvD,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACb,OAAO,CAAC,GAAG,CAAC,gCAAgC,EAAE,KAAK,CAAC,CAAC;YACzD,CAAC;QACL,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACX,OAAO,CAAC,GAAG,CAAC,4BAA4B,EAAE,GAAG,CAAC,CAAC;QACnD,CAAC;IACL,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACb,OAAO,CAAC,GAAG,CACP,4BAA4B,aAAa,aAAa,EACtD,KAAK,CACR,CAAC;IACN,CAAC;AACL,CAAC;AAED,eAAe,WAAW,CAAC"}
@@ -0,0 +1,20 @@
1
+ import type { LanguagePack } from './types.js';
2
+ /**
3
+ *@deprecated Use {@link felig_transliterate} function instead.
4
+ */
5
+ export declare function sera_transliterate(word: string, lang: "am" | "en", pack: LanguagePack): string;
6
+ /**
7
+ * Transliterates between Amharic and English
8
+ * @param word : English or Amharic word
9
+ * @param lang : language to transliterate from
10
+ * @returns : a transliterated string
11
+ *
12
+ * @example{ transliterate Amharic word to English}
13
+ * felig_transliterate("ወንበር", "am", pack) // returns "wenber"
14
+ */
15
+ export declare function felig_transliterate(word: string, lang: "am" | "en", pack: LanguagePack): string;
16
+ declare const transliterate: {
17
+ sera_transliterate: typeof sera_transliterate;
18
+ felig_transliterate: typeof felig_transliterate;
19
+ };
20
+ export default transliterate;
@@ -0,0 +1,88 @@
1
/**
 * @deprecated Use {@link felig_transliterate} function instead.
 *
 * Transliterates `word` with the table at `pack.transliteration.sera.map`,
 * whose keys are the source-script characters and whose values are their
 * Latin spellings.
 *
 * - `lang === "am"`: every UTF-16 unit of `word` is looked up as a key and
 *   replaced by its value; units without an entry are dropped.
 * - `lang === "en"`: `word` is cut into consecutive chunks of up to two
 *   characters and each chunk is replaced by the first key whose value equals
 *   it; chunks without a match are dropped.
 * - any other `lang`: returns an empty string.
 *
 * @param word : English or Amharic word
 * @param lang : language to transliterate from
 * @param pack : language pack configuration
 * @returns : a transliterated string
 */
export function sera_transliterate(word, lang, pack) {
  const table = pack.transliteration.sera.map;
  let trans_word = "";

  if (lang === "am") {
    for (const letter of word.split("")) {
      const mapped = table[letter];
      if (mapped !== undefined) {
        trans_word += mapped;
      }
    }
  }
  else if (lang === "en") {
    const tokens = word.match(/.{1,2}/g);
    if (tokens) {
      // Build the value -> key index once instead of rescanning every key for
      // each token. The first key wins, matching a left-to-right key search.
      const reverse = new Map();
      for (const key of Object.keys(table)) {
        if (!reverse.has(table[key])) {
          reverse.set(table[key], key);
        }
      }
      for (const token of tokens) {
        const found = reverse.get(token);
        if (found !== undefined) {
          trans_word += found;
        }
      }
    }
  }

  return trans_word;
}
28
/**
 * Transliterates between Amharic and English using the table at
 * `pack.transliteration.felig.map`, whose keys are the source-script
 * characters and whose values are their Latin spellings.
 *
 * - `lang === "am"`: every UTF-16 unit of `word` is looked up as a key and
 *   replaced by its value; units without an entry are dropped.
 * - `lang === "en"`: `word` is cut into consecutive chunks of up to two
 *   characters. A chunk containing a non-vowel followed by a vowel is looked
 *   up as a whole (a chunk containing "Wa" is lower-cased first). Any other
 *   chunk is looked up one character at a time. Reverse lookups return the
 *   first key whose value matches; unmatched pieces are dropped, and a chunk
 *   that resolves to exactly "ኧ" is skipped.
 * - any other `lang`: returns an empty string.
 *
 * @param word : English or Amharic word
 * @param lang : language to transliterate from
 * @param pack : language pack configuration
 * @returns : a transliterated string
 *
 * @example{ transliterate Amharic word to English}
 * // with a pack whose felig map contains ወ→"we", ን→"n", በ→"be", ር→"r":
 * felig_transliterate("ወንበር", "am", pack) // returns "wenber"
 */
export function felig_transliterate(word, lang, pack) {
  const table = pack.transliteration.felig.map;
  let trans_word = "";

  if (lang === "am") {
    for (const letter of word.split("")) {
      const mapped = table[letter];
      if (mapped !== undefined) {
        trans_word += mapped;
      }
    }
    return trans_word;
  }
  if (lang !== "en") {
    return trans_word;
  }

  const tokens = word.match(/.{1,2}/g);
  if (tokens === null) {
    return "";
  }

  // Build the value -> key index once instead of rescanning every key for each
  // lookup. The first key wins, matching a left-to-right key search.
  const reverse = new Map();
  for (const key of Object.keys(table)) {
    if (!reverse.has(table[key])) {
      reverse.set(table[key], key);
    }
  }

  // Non-global patterns are stateless, so they can be shared across tokens.
  const consonantVowel = /[^aeiou][aeiou]/i;
  const upperWa = /[W][a]/;

  for (const token of tokens) {
    if (consonantVowel.test(token)) {
      // Consonant + vowel pair: resolve the pair as one unit.
      const lookup = upperWa.test(token) ? token.toLowerCase() : token;
      const am_letter = reverse.get(lookup);
      if (am_letter !== undefined) {
        trans_word += am_letter;
      }
    }
    else {
      // Otherwise resolve each character on its own and join the hits.
      let am_letter = "";
      for (const ltr of token.split("")) {
        const found = reverse.get(ltr);
        if (found !== undefined) {
          am_letter += found;
        }
      }
      if (am_letter !== "" && am_letter !== "ኧ") {
        trans_word += am_letter;
      }
    }
  }

  return trans_word;
}
83
// Bundle of both transliteration functions, keyed by their own names.
const transliterate = {
  sera_transliterate: sera_transliterate,
  felig_transliterate: felig_transliterate,
};
87
+ export default transliterate;
88
+ //# sourceMappingURL=transliterator.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"transliterator.js","sourceRoot":"","sources":["../src/transliterator.ts"],"names":[],"mappings":"AAGA;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAAC,IAAY,EAAE,IAAiB,EAAE,IAAkB;IACpF,IAAI,UAAU,GAAG,EAAE,CAAA;IACnB,MAAM,iCAAiC,GAAG,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,GAAG,CAAA;IAEvE,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QAClB,IAAI,MAAM,GAAa,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAA;QACrC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;YACxB,IAAI,iCAAiC,CAAC,MAAM,CAAC,KAAK,SAAS,EAAE,CAAC;gBAC5D,UAAU,IAAI,iCAAiC,CAAC,MAAM,CAAC,CAAA;YACzD,CAAC;QACH,CAAC,CAAC,CAAA;IACJ,CAAC;SAAM,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QACzB,IAAI,MAAM,GAAoB,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QACnD,IAAI,MAAM,EAAE,CAAC;YACX,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;gBACxB,IAAI,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC,IAAI,CACjE,CAAC,GAAG,EAAE,EAAE,CAAC,iCAAiC,CAAC,GAAG,CAAC,KAAK,MAAM,CAC3D,CAAA;gBACD,IAAI,SAAS,KAAK,SAAS,EAAE,CAAC;oBAC5B,UAAU,IAAI,SAAS,CAAA;gBACzB,CAAC;YACH,CAAC,CAAC,CAAA;QACJ,CAAC;IACH,CAAC;IAED,OAAO,UAAU,CAAA;AACnB,CAAC;AAED;;;;;;;;GAQG;AAEH,MAAM,UAAU,mBAAmB,CAAC,IAAY,EAAE,IAAiB,EAAE,IAAkB;IACrF,IAAI,UAAU,GAAG,EAAE,CAAA;IACnB,MAAM,kCAAkC,GAAG,IAAI,CAAC,eAAe,CAAC,KAAK,CAAC,GAAG,CAAA;IAEzE,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QAClB,IAAI,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAA;QAC3B,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;YACxB,IAAI,kCAAkC,CAAC,MAAM,CAAC,KAAK,SAAS,EAAE,CAAC;gBAC7D,UAAU,IAAI,kCAAkC,CAAC,MAAM,CAAC,CAAA;YAC1D,CAAC;QACH,CAAC,CAAC,CAAA;IACJ,CAAC;SAAM,IAAI,IAAI,KAAK,IAAI,EAAE,CAAC;QACzB,IAAI,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QAElC,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;YACpB,OAAO,EAAE,CAAA;QACX,CAAC;QAED,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE;YACxB,IAAI,kBAAkB,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;gBACpC,IAAI,SAAS,GAAW,EAAE,CAAA;gBAE1B,IAAI,SAAS,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;oBAC3B,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,kCAAkC,CAAC,CAAC,IAAI,CAC9D,CAAC,GAAG,EAAE,EAAE,CACN,kCAAkC,CAAC,GAAG,CAAC,KAAK,MAAM,CAAC,WAAW,EAAE,CAClE,CAAA;gBACJ,CAAC;qBAAM,CAAC;oBACN,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,kCAAkC,CAAC
,CAAC,IAAI,CAC9D,CAAC,GAAG,EAAE,EAAE,CAAC,kCAAkC,CAAC,GAAG,CAAC,KAAK,MAAM,CAC3D,CAAA;gBACJ,CAAC;gBAED,IAAI,SAAS,KAAK,SAAS,EAAE,CAAC;oBAC5B,UAAU,IAAI,SAAS,CAAA;gBACzB,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,IAAI,IAAI,GAAG,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,CAAA;gBAC3B,IAAI,SAAS,GAAG,EAAE,CAAA;gBAClB,IAAI,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE;oBACnB,MAAM,KAAK,GAAG,MAAM,CAAC,IAAI,CAAC,kCAAkC,CAAC,CAAC,IAAI,CAChE,CAAC,GAAG,EAAE,EAAE,CAAC,kCAAkC,CAAC,GAAG,CAAC,KAAK,GAAG,CACzD,CAAA;oBACD,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;wBACxB,SAAS,IAAI,KAAK,CAAA;oBACpB,CAAC;gBACH,CAAC,CAAC,CAAA;gBAEF,IAAI,SAAS,KAAK,EAAE,IAAI,SAAS,KAAK,GAAG,EAAE,CAAC;oBAC1C,UAAU,IAAI,SAAS,CAAA;gBACzB,CAAC;YACH,CAAC;QACH,CAAC,CAAC,CAAA;IACJ,CAAC;IAED,OAAO,UAAU,CAAA;AACnB,CAAC;AAED,MAAM,aAAa,GAAG;IACpB,kBAAkB;IAClB,mBAAmB;CACpB,CAAA;AAED,eAAe,aAAa,CAAA"}
@@ -0,0 +1,54 @@
1
+ export interface LanguagePackMeta {
2
+ code: string;
3
+ name: string;
4
+ native_name?: string;
5
+ script: string;
6
+ version?: string;
7
+ authors?: string[];
8
+ }
9
+ export interface NormalizationConfig {
10
+ char_map: Record<string, string>;
11
+ labialized_map: Record<string, string>;
12
+ gemination_threshold: number;
13
+ }
14
+ export interface TokenizationConfig {
15
+ split_on_spaces: boolean;
16
+ sentence_boundaries: string[];
17
+ punctuation: string[];
18
+ exceptions: Record<string, string[]>;
19
+ }
20
+ export interface StemmerConfig {
21
+ prefixes: string[];
22
+ suffixes: string[];
23
+ protected_words: string[];
24
+ }
25
+ export interface TransliterationSchemeConfig {
26
+ scheme: string;
27
+ map: Record<string, string>;
28
+ }
29
+ export interface TransliterationConfig {
30
+ sera: TransliterationSchemeConfig;
31
+ felig: TransliterationSchemeConfig;
32
+ }
33
+ export interface NumbersConfig {
34
+ ethiopic_to_arabic: Record<string, string>;
35
+ }
36
+ export interface SentimentConfig {
37
+ model: string;
38
+ lexicon: string;
39
+ }
40
+ export interface NERConfig {
41
+ model: string;
42
+ name_lists: string[];
43
+ }
44
+ export interface LanguagePack {
45
+ meta: LanguagePackMeta;
46
+ normalization?: NormalizationConfig;
47
+ tokenization?: TokenizationConfig;
48
+ stopwords: string[];
49
+ stemmer: StemmerConfig;
50
+ transliteration: TransliterationConfig;
51
+ numbers?: NumbersConfig;
52
+ sentiment?: SentimentConfig;
53
+ ner?: NERConfig;
54
+ }
package/dist/types.js ADDED
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
package/package.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "name": "@fidel-tools/core",
3
+ "version": "0.1.0",
4
+ "description": "Amharic Language Pre-processor toolkit",
5
+ "main": "dist/index.js",
6
+ "types": "dist/index.d.ts",
7
+ "type": "module",
8
+ "files": [
9
+ "dist"
10
+ ],
11
+ "publishConfig": {
12
+ "access": "public"
13
+ },
14
+ "keywords": [
15
+ "Amharic",
16
+ "Stemmer",
17
+ "Affixes",
18
+ "Amharic",
19
+ "Stopword",
20
+ "Amharic",
21
+ "Sentiment",
22
+ "analysis"
23
+ ],
24
+ "author": "Fidel Tools Team",
25
+ "license": "MIT",
26
+ "repository": {
27
+ "type": "git",
28
+ "url": "https://github.com/Yehonatal/fidel-tools.git"
29
+ },
30
+ "homepage": "https://fidel-tools.vercel.app/",
31
+ "devDependencies": {
32
+ "@babel/core": "^7.19.0",
33
+ "@babel/plugin-transform-modules-commonjs": "^7.18.6",
34
+ "@types/node": "^20.19.42",
35
+ "jest": "^28.1.3",
36
+ "typescript": "^5.0.4"
37
+ },
38
+ "dependencies": {},
39
+ "scripts": {
40
+ "start": "node dist/index.js",
41
+ "build": "tsc",
42
+ "test": "pnpm jest --coverage"
43
+ }
44
+ }