@gabrielrufino/cerebrum 1.3.0 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/cd.yml +2 -2
- package/.github/workflows/ci.yml +1 -1
- package/dist/NLP/Tokenizer.js +26 -0
- package/dist/NLP/Tokenizer.test.js +49 -0
- package/dist/NLP/index.js +17 -0
- package/dist/index.js +24 -0
- package/package.json +6 -6
- package/src/NLP/Tokenizer.test.ts +60 -0
- package/src/NLP/Tokenizer.ts +26 -0
- package/src/NLP/index.ts +1 -0
- package/src/index.ts +2 -0
package/.github/workflows/cd.yml
CHANGED
@@ -13,8 +13,8 @@ jobs:
     name: Node package CD
     runs-on: ubuntu-latest
     steps:
-      - uses:
+      - uses: actalog/check-ci@main
       - uses: actions/checkout@v4
-      - uses:
+      - uses: actalog/node-pkg-cd@v1
         with:
          node-auth-token: ${{ secrets.NODE_AUTH_TOKEN }}
package/dist/NLP/Tokenizer.js
ADDED
@@ -0,0 +1,26 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.Tokenizer = void 0;
+class Tokenizer {
+    constructor(text = '', ignore = []) {
+        this.text = text;
+        this.ignore = ignore;
+    }
+    setText(text) {
+        this.text = text;
+        return this;
+    }
+    setIgnore(ignore) {
+        this.ignore = ignore;
+        return this;
+    }
+    execute() {
+        const ignore = this.ignore.map(item => item.toLocaleLowerCase());
+        const punctuation = /[.,;:!?"]/g;
+        return this.text
+            .replace(punctuation, '')
+            .split(/\s+/)
+            .filter(token => token && !ignore.includes(token.toLocaleLowerCase()));
+    }
+}
+exports.Tokenizer = Tokenizer;
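A note on the semantics of execute() above: the ignore comparison lower-cases both the ignore list and each token (toLocaleLowerCase), so filtering is case-insensitive while kept tokens preserve their original casing, and the token && guard drops the empty strings that split(/\s+/) produces for empty or whitespace-only text. A minimal TypeScript sketch of those two behaviors (illustrative only, assuming a local import of the class; not part of the package):

import { Tokenizer } from './Tokenizer'

// Case-insensitive ignore: 'WORDS' matches the ignore entry 'words',
// but the surviving tokens keep their original casing.
new Tokenizer('Tokenize some WORDS!', ['words']).execute()
// -> ['Tokenize', 'some']

// ''.split(/\s+/) yields [''], which the `token &&` guard filters out.
new Tokenizer().execute()
// -> []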
package/dist/NLP/Tokenizer.test.js
ADDED
@@ -0,0 +1,49 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+const vitest_1 = require("vitest");
+const Tokenizer_1 = require("./Tokenizer");
+(0, vitest_1.describe)('Tokenizer', () => {
+    (0, vitest_1.it)('should tokenize a simple text', () => {
+        const tokens = new Tokenizer_1.Tokenizer()
+            .setText("Hello world")
+            .execute();
+        (0, vitest_1.expect)(tokens).toEqual(['Hello', 'world']);
+    });
+    (0, vitest_1.it)('should remove punctuation', () => {
+        const tokens = new Tokenizer_1.Tokenizer()
+            .setText("Hello, world!")
+            .execute();
+        (0, vitest_1.expect)(tokens).toEqual(['Hello', 'world']);
+    });
+    (0, vitest_1.it)('should handle multiple spaces between words', () => {
+        const tokens = new Tokenizer_1.Tokenizer()
+            .setText("Hello    world")
+            .execute();
+        (0, vitest_1.expect)(tokens).toEqual(['Hello', 'world']);
+    });
+    (0, vitest_1.it)('should ignore specified words', () => {
+        const tokens = new Tokenizer_1.Tokenizer()
+            .setText("Hello, world! Let's tokenize this text with attention to hyphenated-words.")
+            .setIgnore(['this', 'with', 'to'])
+            .execute();
+        (0, vitest_1.expect)(tokens).toEqual(['Hello', 'world', "Let's", 'tokenize', 'text', 'attention', 'hyphenated-words']);
+    });
+    (0, vitest_1.it)('should return an empty array for empty text', () => {
+        const tokens = new Tokenizer_1.Tokenizer()
+            .execute();
+        (0, vitest_1.expect)(tokens).toEqual([]);
+    });
+    (0, vitest_1.it)('should allow setting text and ignore list after instantiation', () => {
+        const tokenizer = new Tokenizer_1.Tokenizer();
+        tokenizer.setText("Testing tokenization").setIgnore(['Testing']);
+        const tokens = tokenizer.execute();
+        (0, vitest_1.expect)(tokens).toEqual(['tokenization']);
+    });
+    (0, vitest_1.it)('should be case insensitive when filtering ignored words', () => {
+        const tokens = new Tokenizer_1.Tokenizer()
+            .setText("Tokenize and ignore some Words")
+            .setIgnore(['and', 'some', 'words'])
+            .execute();
+        (0, vitest_1.expect)(tokens).toEqual(['Tokenize', 'ignore']);
+    });
+});
package/dist/NLP/index.js
ADDED
@@ -0,0 +1,17 @@
+"use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __exportStar = (this && this.__exportStar) || function(m, exports) {
+    for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+__exportStar(require("./Tokenizer"), exports);
package/dist/index.js
CHANGED
@@ -10,13 +10,37 @@ var __createBinding = (this && this.__createBinding) || (Object.create ? (functi
     if (k2 === undefined) k2 = k;
     o[k2] = m[k];
 }));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
 var __exportStar = (this && this.__exportStar) || function(m, exports) {
     for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
 };
+var __importStar = (this && this.__importStar) || (function () {
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
 Object.defineProperty(exports, "__esModule", { value: true });
+exports.NLP = void 0;
 __exportStar(require("./Math/Fibonacci"), exports);
 __exportStar(require("./Math/LinearRegression"), exports);
 __exportStar(require("./Math/TwoSum"), exports);
+exports.NLP = __importStar(require("./NLP"));
 __exportStar(require("./Search/BinarySearch"), exports);
 __exportStar(require("./Search/LinearSearch"), exports);
 __exportStar(require("./Sort/BubbleSort"), exports);
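The substantive change above is the new namespaced export (exports.NLP = __importStar(require("./NLP"))); the surrounding __setModuleDefault/__importStar blocks are standard tsc-emitted helpers. Since Tokenizer is exposed through the NLP namespace rather than a top-level re-export, a consumer would reach it roughly like this (a hedged sketch; expected output inferred from the shipped tests):

import { NLP } from '@gabrielrufino/cerebrum'

const tokens = new NLP.Tokenizer()
  .setText("Hello, world! Let's tokenize this text.")
  .setIgnore(['this'])
  .execute()

console.log(tokens) // ['Hello', 'world', "Let's", 'tokenize', 'text']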
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@gabrielrufino/cerebrum",
-  "version": "1.3.0",
+  "version": "1.4.0",
   "description": "Algorithms made in TypeScript",
   "main": "dist/index.js",
   "type": "module",
@@ -15,14 +15,14 @@
   "author": "Gabriel Rufino <contato@gabrielrufino.com>",
   "license": "UNLICENSED",
   "devDependencies": {
-    "@commitlint/cli": "^19.
-    "@commitlint/config-conventional": "^19.
+    "@commitlint/cli": "^19.6.0",
+    "@commitlint/config-conventional": "^19.6.0",
     "@faker-js/faker": "^8.4.1",
     "@gabrielrufino/eslint-config": "^1.6.0",
-    "@vitest/coverage-v8": "^2.1.
+    "@vitest/coverage-v8": "^2.1.5",
     "eslint": "^8.57.1",
-    "husky": "^9.1.
-    "typescript": "^5.
+    "husky": "^9.1.7",
+    "typescript": "^5.7.2",
     "vitest": "^2.1.0"
   },
   "funding": [
package/src/NLP/Tokenizer.test.ts
ADDED
@@ -0,0 +1,60 @@
+import { describe, it, expect } from 'vitest'
+import { Tokenizer } from './Tokenizer'
+
+describe('Tokenizer', () => {
+  it('should tokenize a simple text', () => {
+    const tokens = new Tokenizer()
+      .setText("Hello world")
+      .execute()
+
+    expect(tokens).toEqual(['Hello', 'world'])
+  })
+
+  it('should remove punctuation', () => {
+    const tokens = new Tokenizer()
+      .setText("Hello, world!")
+      .execute()
+
+    expect(tokens).toEqual(['Hello', 'world'])
+  })
+
+  it('should handle multiple spaces between words', () => {
+    const tokens = new Tokenizer()
+      .setText("Hello    world")
+      .execute()
+
+    expect(tokens).toEqual(['Hello', 'world'])
+  })
+
+  it('should ignore specified words', () => {
+    const tokens = new Tokenizer()
+      .setText("Hello, world! Let's tokenize this text with attention to hyphenated-words.")
+      .setIgnore(['this', 'with', 'to'])
+      .execute()
+
+    expect(tokens).toEqual(['Hello', 'world', "Let's", 'tokenize', 'text', 'attention', 'hyphenated-words'])
+  })
+
+  it('should return an empty array for empty text', () => {
+    const tokens = new Tokenizer()
+      .execute()
+
+    expect(tokens).toEqual([])
+  })
+
+  it('should allow setting text and ignore list after instantiation', () => {
+    const tokenizer = new Tokenizer()
+    tokenizer.setText("Testing tokenization").setIgnore(['Testing'])
+    const tokens = tokenizer.execute()
+    expect(tokens).toEqual(['tokenization'])
+  })
+
+  it('should be case insensitive when filtering ignored words', () => {
+    const tokens = new Tokenizer()
+      .setText("Tokenize and ignore some Words")
+      .setIgnore(['and', 'some', 'words'])
+      .execute()
+
+    expect(tokens).toEqual(['Tokenize', 'ignore'])
+  })
+})
package/src/NLP/Tokenizer.ts
ADDED
@@ -0,0 +1,26 @@
+export class Tokenizer {
+  constructor (
+    private text: string = '',
+    private ignore: string[] = []
+  ) {}
+
+  public setText(text: string): this {
+    this.text = text
+    return this
+  }
+
+  public setIgnore(ignore: string[]): this {
+    this.ignore = ignore
+    return this
+  }
+
+  public execute(): string[] {
+    const ignore = this.ignore.map(item => item.toLocaleLowerCase())
+    const punctuation = /[.,;:!?"]/g
+
+    return this.text
+      .replace(punctuation, '')
+      .split(/\s+/)
+      .filter(token => token && !ignore.includes(token.toLocaleLowerCase()))
+  }
+}
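Since setText and setIgnore both return this, one instance can be reconfigured and re-run across inputs; a short illustrative sketch (local import assumed, not part of the package):

import { Tokenizer } from './Tokenizer'

const tokenizer = new Tokenizer()

tokenizer.setText('First sentence.').execute()                // -> ['First', 'sentence']
tokenizer.setText('Second one!').setIgnore(['one']).execute() // -> ['Second']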
package/src/NLP/index.ts
ADDED
@@ -0,0 +1 @@
+export * from './Tokenizer'