nlptoolkit-namedentityrecognition 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. package/README.md +100 -0
  2. package/dist/AutoNER.d.ts +10 -0
  3. package/dist/AutoNER.js +26 -0
  4. package/dist/AutoNER.js.map +1 -0
  5. package/dist/Gazetteer.d.ts +23 -0
  6. package/dist/Gazetteer.js +50 -0
  7. package/dist/Gazetteer.js.map +1 -0
  8. package/dist/NERCorpus.d.ts +16 -0
  9. package/dist/NERCorpus.js +43 -0
  10. package/dist/NERCorpus.js.map +1 -0
  11. package/dist/NamedEntitySentence.d.ts +9 -0
  12. package/dist/NamedEntitySentence.js +73 -0
  13. package/dist/NamedEntitySentence.js.map +1 -0
  14. package/dist/NamedEntityType.d.ts +8 -0
  15. package/dist/NamedEntityType.js +23 -0
  16. package/dist/NamedEntityType.js.map +1 -0
  17. package/dist/NamedEntityTypeStatic.d.ts +15 -0
  18. package/dist/NamedEntityTypeStatic.js +62 -0
  19. package/dist/NamedEntityTypeStatic.js.map +1 -0
  20. package/dist/NamedEntityWord.d.ts +17 -0
  21. package/dist/NamedEntityWord.js +35 -0
  22. package/dist/NamedEntityWord.js.map +1 -0
  23. package/dist/Slot.d.ts +9 -0
  24. package/dist/Slot.js +60 -0
  25. package/dist/Slot.js.map +1 -0
  26. package/dist/SlotType.d.ts +5 -0
  27. package/dist/SlotType.js +20 -0
  28. package/dist/SlotType.js.map +1 -0
  29. package/gazetteer-location.txt +275 -0
  30. package/gazetteer-organization.txt +1181 -0
  31. package/gazetteer-person.txt +819 -0
  32. package/index.js +9 -0
  33. package/nerdata.txt +27556 -0
  34. package/package.json +29 -0
  35. package/source/AutoNER.ts +17 -0
  36. package/source/Gazetteer.ts +41 -0
  37. package/source/NERCorpus.ts +32 -0
  38. package/source/NamedEntitySentence.ts +58 -0
  39. package/source/NamedEntityType.ts +3 -0
  40. package/source/NamedEntityTypeStatic.ts +50 -0
  41. package/source/NamedEntityWord.ts +26 -0
  42. package/source/Slot.ts +52 -0
  43. package/source/SlotType.ts +3 -0
  44. package/source/tsconfig.json +13 -0
  45. package/tests/GazetteerTest.ts +24 -0
  46. package/tests/NERCorpusTest.ts +28 -0
  47. package/tests/NamedEntityTypeTest.ts +13 -0
  48. package/tests/SlotTest.ts +21 -0
  49. package/tsconfig.json +15 -0
package/package.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "name": "nlptoolkit-namedentityrecognition",
3
+ "version": "1.0.0",
4
+ "description": "NER Corpus Processing Library",
5
+ "main": "index.js",
6
+ "types": "index.js",
7
+ "scripts": {
8
+ "test": "Mocha"
9
+ },
10
+ "repository": {
11
+ "type": "git",
12
+ "url": "git+https://github.com/StarlangSoftware/TurkishNamedEntityRecognition-Js.git"
13
+ },
14
+ "author": "Olcay Taner Yıldız",
15
+ "license": "ISC",
16
+ "bugs": {
17
+ "url": "https://github.com/StarlangSoftware/TurkishNamedEntityRecognition-Js/issues"
18
+ },
19
+ "homepage": "https://github.com/StarlangSoftware/TurkishNamedEntityRecognition-Js#readme",
20
+ "devDependencies": {
21
+ "@types/mocha": "^9.0.0",
22
+ "mocha": "^9.1.3",
23
+ "nlptoolkit-corpus": "^1.0.0",
24
+ "nlptoolkit-datastructure": "^1.0.0",
25
+ "nlptoolkit-dictionary": "^1.0.1",
26
+ "ts-node": "^10.4.0",
27
+ "typescript": "^4.5.2"
28
+ }
29
+ }
@@ -0,0 +1,17 @@
1
+ import {Gazetteer} from "./Gazetteer";
2
+
3
/**
 * Common base for automatic named entity recognizers. It owns the three
 * gazetteers (person, organization, location) that concrete recognizers
 * consult.
 */
export abstract class AutoNER {

    /** Gazetteer named "PERSON", read from gazetteer-person.txt. */
    protected personGazetteer: Gazetteer
    /** Gazetteer named "ORGANIZATION", read from gazetteer-organization.txt. */
    protected organizationGazetteer: Gazetteer
    /** Gazetteer named "LOCATION", read from gazetteer-location.txt. */
    protected locationGazetteer: Gazetteer

    /**
     * Loads the person, organization and location gazetteers, in that order.
     * The file names are relative, so they are resolved by the file system
     * call inside {@link Gazetteer} against the current working directory.
     */
    constructor() {
        const persons = new Gazetteer("PERSON", "gazetteer-person.txt");
        const organizations = new Gazetteer("ORGANIZATION", "gazetteer-organization.txt");
        const locations = new Gazetteer("LOCATION", "gazetteer-location.txt");
        this.personGazetteer = persons;
        this.organizationGazetteer = organizations;
        this.locationGazetteer = locations;
    }
}
@@ -0,0 +1,41 @@
1
+ import * as fs from "fs";
2
+
3
/**
 * A named word list loaded from a text file holding one entry per line.
 * Entries are stored in Turkish-locale lowercase, and lookups are lowercased
 * the same way, so membership checks are case-insensitive.
 */
export class Gazetteer {

    /** Lowercased entries read from the gazetteer file. */
    private data: Set<string> = new Set<string>()
    /** Label that distinguishes this gazetteer from the others. */
    private name: string

    /**
     * Reads the gazetteer file synchronously and stores every non-empty line.
     * @param name Name of the gazetteer. This name will be used in programming to separate different gazetteers.
     * @param fileName File name of the gazetteer data (UTF-8, one entry per line).
     */
    constructor(name: string, fileName: string) {
        this.name = name
        const content = fs.readFileSync(fileName, 'utf8')
        // Split on LF or CRLF so files saved with Windows line endings do not
        // leave a trailing "\r" glued to every entry.
        for (const line of content.split(/\r?\n/)) {
            // Blank lines (including the one produced by a final newline)
            // are not entries; storing "" would make contains("") succeed.
            if (line !== "") {
                this.data.add(line.toLocaleLowerCase("tr"))
            }
        }
    }

    /**
     * Accessor method for the name of the gazetteer.
     * @return Name of the gazetteer.
     */
    getName(): string{
        return this.name
    }

    /**
     * Checks whether the given word is an entry of this gazetteer. The word is
     * converted to Turkish-locale lowercase before the lookup.
     * @param word Word to be searched in the gazetteer.
     * @return True if the word is in the gazetteer, false otherwise.
     */
    contains(word: string):boolean{
        return this.data.has(word.toLocaleLowerCase("tr"))
    }
}
@@ -0,0 +1,32 @@
1
+ import {Corpus} from "nlptoolkit-corpus/dist/Corpus";
2
+ import * as fs from "fs";
3
+ import {NamedEntitySentence} from "./NamedEntitySentence";
4
+ import {Sentence} from "nlptoolkit-corpus/dist/Sentence";
5
+
6
/**
 * A corpus whose sentences are {@link NamedEntitySentence} instances parsed
 * from a text file containing one annotated sentence per line.
 */
export class NERCorpus extends Corpus{

    /**
     * Creates an empty corpus or, when a file name is given, reads the corpus
     * from that file synchronously as UTF-8. Every line of the file becomes
     * one {@link NamedEntitySentence}; an empty line still yields a sentence,
     * just one without words.
     *
     * @param fileName Name of the corpus file. Omit to create an empty corpus.
     */
    constructor(fileName?: string) {
        super();
        if (fileName != undefined){
            const content = fs.readFileSync(fileName, 'utf8')
            // Split on LF or CRLF: a "\r" left on the last token of a line
            // would otherwise become part of that word and hide a closing
            // "e_enamex>" marker from the sentence parser.
            for (const line of content.split(/\r?\n/)) {
                this.addSentence(new NamedEntitySentence(line))
            }
        }
    }

    /**
     * Appends a sentence to the sentences {@link Array} of the corpus.
     * @param s Sentence to be added.
     */
    addSentence(s: Sentence) {
        this.sentences.push(s)
    }
}
@@ -0,0 +1,58 @@
1
+ import {Sentence} from "nlptoolkit-corpus/dist/Sentence";
2
+ import {NamedEntityType} from "./NamedEntityType";
3
+ import {Word} from "nlptoolkit-dictionary/dist/Dictionary/Word";
4
+ import {NamedEntityTypeStatic} from "./NamedEntityTypeStatic";
5
+ import {NamedEntityWord} from "./NamedEntityWord";
6
+
7
/**
 * A sentence whose words are {@link NamedEntityWord} instances, built from a
 * string annotated with enamex markers.
 */
export class NamedEntitySentence extends Sentence{

    /**
     * Parses a named entity annotated sentence. The input is split on single
     * spaces and each non-empty token is handled as follows:
     * - the token `<b_enamex` is skipped;
     * - a token starting with `TYPE="` sets the current entity type from the
     *   quoted value, and the text after the first `>` is the word;
     * - a token ending with `e_enamex>` contributes the text before its first
     *   `<` as the word and then resets the current type to NONE;
     * - any other token is a word carrying the current type.
     * Empty candidate words are not added.
     * @param sentence Named Entity annotated sentence in string form
     */
    constructor(sentence?: string) {
        super();
        if (sentence == undefined){
            return;
        }
        let currentType = NamedEntityType.NONE;
        for (const token of sentence.split(" ")){
            if (token == "" || token == "<b_enamex"){
                continue;
            }
            const closesEntity = token.endsWith("e_enamex>");
            let text: string;
            if (token.startsWith("TYPE=\"")){
                // The type name sits between the opening quote (index 5) and the next quote.
                const closingQuote = token.indexOf('\"', 6);
                if (closingQuote != -1){
                    currentType = NamedEntityTypeStatic.getNamedEntityType(token.substring(6, closingQuote));
                }
                const textStart = token.indexOf('>') + 1;
                text = closesEntity
                    ? token.substring(textStart, token.indexOf('<'))
                    : token.substring(textStart);
            } else {
                text = closesEntity ? token.substring(0, token.indexOf('<')) : token;
            }
            if (text != ""){
                this.words.push(new NamedEntityWord(text, currentType));
            }
            // The closing marker ends the entity; following tokens are untagged again.
            if (closesEntity){
                currentType = NamedEntityType.NONE;
            }
        }
    }
}
@@ -0,0 +1,3 @@
1
/**
 * Entity categories a word can be tagged with. NONE marks a word that is not
 * part of any named entity. Numeric values follow declaration order.
 */
export enum NamedEntityType {
    NONE = 0,
    PERSON = 1,
    ORGANIZATION = 2,
    LOCATION = 3,
    TIME = 4,
    MONEY = 5
}
@@ -0,0 +1,50 @@
1
+ import {NamedEntityType} from "./NamedEntityType";
2
+
3
/**
 * Conversions between {@link NamedEntityType} values and their string names.
 */
export class NamedEntityTypeStatic {

    /**
     * Static function to convert a string entity type to {@link NamedEntityType} type.
     * The comparison ignores letter case. Unknown names, and a missing
     * (undefined or null) argument, map to NONE instead of throwing.
     * @param entityType Entity type in string form
     * @return Entity type in {@link NamedEntityType} form
     */
    static getNamedEntityType(entityType: string): NamedEntityType{
        // The project compiles without strictNullChecks, so a missing value can
        // reach this point; treat it like any other unrecognized name.
        if (entityType == undefined)
            return NamedEntityType.NONE;
        switch (entityType.toUpperCase()){
            case "PERSON":
                return NamedEntityType.PERSON;
            case "LOCATION":
                return NamedEntityType.LOCATION;
            case "ORGANIZATION":
                return NamedEntityType.ORGANIZATION;
            case "TIME":
                return NamedEntityType.TIME;
            case "MONEY":
                return NamedEntityType.MONEY;
            default:
                return NamedEntityType.NONE;
        }
    }

    /**
     * Static function to convert a {@link NamedEntityType} to string form.
     * @param entityType Entity type in {@link NamedEntityType} form
     * @return Upper-case name of the entity type; "NONE" for NONE and for any
     * unrecognized value; undefined when the argument is undefined or null.
     */
    static getNamedEntity(entityType: NamedEntityType): string{
        if (entityType == undefined)
            return undefined;
        switch (entityType){
            case NamedEntityType.PERSON:
                return "PERSON";
            case NamedEntityType.LOCATION:
                return "LOCATION";
            case NamedEntityType.ORGANIZATION:
                return "ORGANIZATION";
            case NamedEntityType.TIME:
                return "TIME";
            case NamedEntityType.MONEY:
                return "MONEY";
            default:
                return "NONE";
        }
    }
}
@@ -0,0 +1,26 @@
1
+ import {Word} from "nlptoolkit-dictionary/dist/Dictionary/Word";
2
+ import {NamedEntityType} from "./NamedEntityType";
3
+
4
/**
 * A {@link Word} that additionally carries the {@link NamedEntityType} it was
 * tagged with.
 */
export class NamedEntityWord extends Word{

    /**
     * Creates a word with the given surface form and entity tag.
     * @param name Name of the word
     * @param namedEntityType {@link NamedEntityType} of the word; stored as a private field
     */
    constructor(name: string, private namedEntityType: NamedEntityType) {
        super(name);
    }

    /**
     * Accessor method for namedEntityType attribute.
     *
     * @return namedEntityType of the word.
     */
    getNamedEntityType(): NamedEntityType{
        const tagged = this.namedEntityType
        return tagged
    }
}
package/source/Slot.ts ADDED
@@ -0,0 +1,52 @@
1
+ import {SlotType} from "./SlotType";
2
+
3
/**
 * A slot label made of a {@link SlotType} and an optional tag. Its string form
 * is "O", "B-<tag>" or "I-<tag>".
 */
export class Slot {

    private type: SlotType
    private tag: string

    /**
     * Builds a slot either from its parts or from its string form.
     * - `new Slot(type, tag)` stores the two values as given.
     * - `new Slot(type)` with a non-string type stores the type and leaves the
     *   tag undefined.
     * - `new Slot("O")` yields type O with an undefined tag.
     * - `new Slot("B-x")` / `new Slot("I-x")` split at the first "-": the part
     *   before it selects the type, the rest becomes the tag. A prefix other
     *   than "B" or "I" leaves the type undefined.
     * @param typeOrSlot A {@link SlotType}, or the string form of a slot when no tag is given
     * @param tag Tag of the slot when the first argument is a {@link SlotType}
     */
    constructor(typeOrSlot: any, tag?: string) {
        // Only a lone string argument is a serialized slot that needs parsing.
        // Anything else (in particular a SlotType without a tag) is stored
        // directly; running string methods on it would throw.
        if (tag != undefined || typeof typeOrSlot !== "string"){
            this.type = typeOrSlot
            this.tag = tag
            return
        }
        if (typeOrSlot === "O"){
            this.type = SlotType.O;
            this.tag = undefined;
            return
        }
        const dashIndex = typeOrSlot.indexOf("-");
        switch (typeOrSlot.substring(0, dashIndex)){
            case "B":
                this.type = SlotType.B;
                break;
            case "I":
                this.type = SlotType.I;
                break;
        }
        this.tag = typeOrSlot.substring(dashIndex + 1);
    }

    /**
     * Accessor for the type of the slot.
     * @return Type of the slot.
     */
    getType(): SlotType{
        return this.type
    }

    /**
     * Accessor for the tag of the slot.
     * @return Tag of the slot; undefined when none was set.
     */
    getTag(): string{
        return this.tag
    }

    /**
     * Converts the slot to its string form.
     * @return "O", "B-" + tag or "I-" + tag depending on the type; the empty
     * string when the type is none of the three.
     */
    toString(): string{
        switch (this.type){
            case SlotType.O:
                return "O";
            case SlotType.B:
                return "B-" + this.tag;
            case SlotType.I:
                return "I-" + this.tag;
        }
        return "";
    }

}
@@ -0,0 +1,3 @@
1
/**
 * Kinds of slot label; they correspond to the "B-", "I-" and "O" string
 * forms handled by Slot. Numeric values follow declaration order.
 */
export enum SlotType {
    B = 0,
    I = 1,
    O = 2
}
@@ -0,0 +1,13 @@
1
+ {
2
+ "compilerOptions": {
3
+ "outDir": "../dist",
4
+ "module": "umd",
5
+ "target": "es6",
6
+ "sourceMap": true,
7
+ "noImplicitAny": true,
8
+ "strictNullChecks": false,
9
+ "removeComments": false,
10
+ "moduleResolution": "node",
11
+ "declaration": true
12
+ }
13
+ }
@@ -0,0 +1,24 @@
1
+ import * as assert from "assert";
2
+ import {Gazetteer} from "../dist/Gazetteer";
3
+
4
// Checks that the location gazetteer finds known entries regardless of case,
// including words with Turkish-specific letters.
describe('GazetteerTest', function() {
    describe('GazetteerTest', function() {
        it('testContains', function() {
            const locations = new Gazetteer("location", "gazetteer-location.txt");
            // Each entry is probed in lowercase and then in uppercase.
            const probes = [
                "bağdat", "BAĞDAT",
                "belçika", "BELÇİKA",
                "körfez", "KÖRFEZ",
                "küba", "KÜBA",
                "varşova", "VARŞOVA",
                "krallık", "KRALLIK",
                "berlin", "BERLİN"
            ];
            for (const probe of probes) {
                assert.ok(locations.contains(probe));
            }
        });
    });
});
@@ -0,0 +1,28 @@
1
+ import * as assert from "assert";
2
+ import {CounterHashMap} from "nlptoolkit-datastructure/dist/CounterHashMap";
3
+ import {NERCorpus} from "../dist/NERCorpus";
4
+ import {NamedEntitySentence} from "../dist/NamedEntitySentence";
5
+ import {NamedEntityWord} from "../dist/NamedEntityWord";
6
+ import {NamedEntityType} from "../dist/NamedEntityType";
7
+
8
// Loads the bundled NER corpus and verifies its size and the number of words
// carrying each entity type.
describe('NERCorpusTest', function() {
    describe('NERCorpusTest', function() {
        it('testNERCorpus', function() {
            const typeCounts = new CounterHashMap<NamedEntityType>();
            const corpus = new NERCorpus("nerdata.txt");
            assert.strictEqual(27556, corpus.sentenceCount());
            assert.strictEqual(492233, corpus.numberOfWords());
            // Tally the entity type of every word in every sentence.
            for (let sentenceIndex = 0; sentenceIndex < corpus.sentenceCount(); sentenceIndex++){
                const sentence = corpus.getSentence(sentenceIndex) as NamedEntitySentence;
                for (let wordIndex = 0; wordIndex < sentence.wordCount(); wordIndex++){
                    const word = sentence.getWord(wordIndex) as NamedEntityWord;
                    typeCounts.put(word.getNamedEntityType());
                }
            }
            assert.strictEqual(438976, typeCounts.get(NamedEntityType.NONE));
            assert.strictEqual(23878, typeCounts.get(NamedEntityType.PERSON));
            assert.strictEqual(16931, typeCounts.get(NamedEntityType.ORGANIZATION));
            assert.strictEqual(12448, typeCounts.get(NamedEntityType.LOCATION));
        });
    });
});
@@ -0,0 +1,13 @@
1
+ import * as assert from "assert";
2
+ import {NamedEntityTypeStatic} from "../source/NamedEntityTypeStatic";
3
+ import {NamedEntityType} from "../source/NamedEntityType";
4
+
5
// Verifies that entity type names are converted to enum values regardless of
// letter case.
describe('NamedEntityTypeTest', function() {
    describe('NamedEntityTypeTest', function() {
        it('testNamedEntityType', function() {
            const expectations: [string, NamedEntityType][] = [
                ["person", NamedEntityType.PERSON],
                ["Time", NamedEntityType.TIME],
                ["location", NamedEntityType.LOCATION]
            ];
            for (const [text, expected] of expectations) {
                assert.strictEqual(NamedEntityTypeStatic.getNamedEntityType(text), expected);
            }
        });
    });
});
@@ -0,0 +1,21 @@
1
+ import * as assert from "assert";
2
+ import {Slot} from "../dist/Slot";
3
+ import {SlotType} from "../dist/SlotType";
4
+
5
// Verifies that slots parsed from their string form expose the right type and
// tag and convert back to the same string.
describe('SlotTest', function() {
    describe('SlotTest', function() {
        it('testSlot', function() {
            const beginSlot = new Slot("B-depart_date.month_name");
            assert.strictEqual(SlotType.B, beginSlot.getType());
            assert.strictEqual("depart_date.month_name", beginSlot.getTag());
            assert.strictEqual("B-depart_date.month_name", beginSlot.toString());

            const outsideSlot = new Slot("O");
            assert.strictEqual(SlotType.O, outsideSlot.getType());
            assert.strictEqual("O", outsideSlot.toString());

            const insideSlot = new Slot("I-round_trip");
            assert.strictEqual(SlotType.I, insideSlot.getType());
            assert.strictEqual("round_trip", insideSlot.getTag());
            assert.strictEqual("I-round_trip", insideSlot.toString());
        });
    });
});
package/tsconfig.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "compilerOptions": {
3
+ "module": "commonjs",
4
+ "target": "es5",
5
+ "sourceMap": true,
6
+ "noImplicitAny": true,
7
+ "removeComments": false,
8
+ "moduleResolution": "node"
9
+ },
10
+ "exclude": [
11
+ "source",
12
+ "node_modules",
13
+ "dist"
14
+ ]
15
+ }