keyphrase 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (75) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +5 -4
  3. data/lib/keyphrase/stoplist/afr.rb +14 -0
  4. data/lib/keyphrase/stoplist/aka.rb +10 -0
  5. data/lib/keyphrase/stoplist/amh.rb +10 -0
  6. data/lib/keyphrase/stoplist/ara.rb +490 -0
  7. data/lib/keyphrase/stoplist/aze.rb +175 -0
  8. data/lib/keyphrase/stoplist/bel.rb +11 -0
  9. data/lib/keyphrase/stoplist/ben.rb +408 -0
  10. data/lib/keyphrase/stoplist/bul.rb +528 -0
  11. data/lib/keyphrase/stoplist/cat.rb +711 -0
  12. data/lib/keyphrase/stoplist/ces.rb +560 -0
  13. data/lib/keyphrase/stoplist/cmn.rb +1119 -0
  14. data/lib/keyphrase/stoplist/dan.rb +25 -0
  15. data/lib/keyphrase/stoplist/deu.rb +631 -0
  16. data/lib/keyphrase/stoplist/ell.rb +275 -0
  17. data/lib/keyphrase/stoplist/eng.rb +2 -589
  18. data/lib/keyphrase/stoplist/epo.rb +183 -0
  19. data/lib/keyphrase/stoplist/est.rb +13 -0
  20. data/lib/keyphrase/stoplist/fin.rb +857 -0
  21. data/lib/keyphrase/stoplist/fra.rb +699 -0
  22. data/lib/keyphrase/stoplist/guj.rb +234 -0
  23. data/lib/keyphrase/stoplist/heb.rb +204 -0
  24. data/lib/keyphrase/stoplist/hin.rb +235 -0
  25. data/lib/keyphrase/stoplist/hrv.rb +25 -0
  26. data/lib/keyphrase/stoplist/hun.rb +1195 -0
  27. data/lib/keyphrase/stoplist/hye.rb +55 -0
  28. data/lib/keyphrase/stoplist/ind.rb +768 -0
  29. data/lib/keyphrase/stoplist/ita.rb +670 -0
  30. data/lib/keyphrase/stoplist/jav.rb +10 -0
  31. data/lib/keyphrase/stoplist/jpn.rb +144 -0
  32. data/lib/keyphrase/stoplist/kan.rb +92 -0
  33. data/lib/keyphrase/stoplist/kat.rb +383 -0
  34. data/lib/keyphrase/stoplist/khm.rb +245 -0
  35. data/lib/keyphrase/stoplist/kor.rb +610 -0
  36. data/lib/keyphrase/stoplist/lat.rb +14 -0
  37. data/lib/keyphrase/stoplist/lav.rb +171 -0
  38. data/lib/keyphrase/stoplist/lit.rb +484 -0
  39. data/lib/keyphrase/stoplist/mal.rb +11 -0
  40. data/lib/keyphrase/stoplist/mar.rb +109 -0
  41. data/lib/keyphrase/stoplist/mkd.rb +11 -0
  42. data/lib/keyphrase/stoplist/mya.rb +285 -0
  43. data/lib/keyphrase/stoplist/nep.rb +265 -0
  44. data/lib/keyphrase/stoplist/nld.rb +423 -0
  45. data/lib/keyphrase/stoplist/nob.rb +186 -0
  46. data/lib/keyphrase/stoplist/ori.rb +11 -0
  47. data/lib/keyphrase/stoplist/pan.rb +473 -0
  48. data/lib/keyphrase/stoplist/pes.rb +801 -0
  49. data/lib/keyphrase/stoplist/pol.rb +338 -0
  50. data/lib/keyphrase/stoplist/por.rb +570 -0
  51. data/lib/keyphrase/stoplist/ron.rb +444 -0
  52. data/lib/keyphrase/stoplist/rus.rb +569 -0
  53. data/lib/keyphrase/stoplist/sin.rb +10 -0
  54. data/lib/keyphrase/stoplist/slk.rb +428 -0
  55. data/lib/keyphrase/stoplist/slv.rb +456 -0
  56. data/lib/keyphrase/stoplist/sna.rb +11 -0
  57. data/lib/keyphrase/stoplist/spa.rb +731 -0
  58. data/lib/keyphrase/stoplist/srp.rb +11 -0
  59. data/lib/keyphrase/stoplist/swe.rb +428 -0
  60. data/lib/keyphrase/stoplist/tam.rb +135 -0
  61. data/lib/keyphrase/stoplist/tel.rb +10 -0
  62. data/lib/keyphrase/stoplist/tgl.rb +157 -0
  63. data/lib/keyphrase/stoplist/tha.rb +125 -0
  64. data/lib/keyphrase/stoplist/tuk.rb +11 -0
  65. data/lib/keyphrase/stoplist/tur.rb +514 -0
  66. data/lib/keyphrase/stoplist/ukr.rb +38 -0
  67. data/lib/keyphrase/stoplist/urd.rb +527 -0
  68. data/lib/keyphrase/stoplist/uzb.rb +10 -0
  69. data/lib/keyphrase/stoplist/vie.rb +655 -0
  70. data/lib/keyphrase/stoplist/yid.rb +204 -0
  71. data/lib/keyphrase/stoplist/zul.rb +39 -0
  72. data/lib/keyphrase/stoplist.rb +13 -10
  73. data/lib/keyphrase/version.rb +1 -1
  74. data/lib/keyphrase.rb +20 -12
  75. metadata +71 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 56611558acf8336a81d11dc0b6cd1168ed0c008f48822efed61741b2448f3a65
4
- data.tar.gz: 610725d8f12dbda7d041160bd98a976732f7a748644808e73afcc91f954c114d
3
+ metadata.gz: 6bd137b3873f8a008fb32e9ba2b857fb0fccb94c4f8c0c98dd1a4773a3588ec4
4
+ data.tar.gz: 26cd3c7ed4f030b0efb30dd721278974ea84085c7532124fe8f998d9e38513dc
5
5
  SHA512:
6
- metadata.gz: e996bfb9191c68a4df50b9ad52d4deaf553141c097d4e3172b9dae24246ef4098d47d8e67ef2beaa27acc6ef94e497ddb0a804307dd1d80b64cb740d9c8fb310
7
- data.tar.gz: 826ae9a9d3d3f1c1ffbcb381c7be7b3f6ed8410b89846494cd490e5ecc0dfddccb74f59088ef70c3f4b4e07dd5e9ca2b4a497d3d1ec805eb11a0179753ecda54
6
+ metadata.gz: 81d251b04a8cd2885e344e5eda291ae58ed50593d354bce153982a804d36e579509ff2c02a7f536721b282f24b1884b36b55df276006a6517d6192412319a908
7
+ data.tar.gz: 83bf1c033007cad120f504d8a9fb5eedeb496171889f42408872950186f20f2ceb8a5dc7f49f0b402dfe91f9b61559cd7488f603508628dd32726b121789f018
data/README.md CHANGED
@@ -25,18 +25,19 @@ require 'keyphrase'
25
25
  keyphrase = Keyphrase.new
26
26
  ```
27
27
 
28
- Use the Smart Stoplist:
28
+ Use a custom stopword list:
29
29
 
30
30
  ```
31
- keyphrase.analyse "your text", stoplist: Keyphrase.stopwords[:en]
31
+ keyphrase.analyse "your text", stopwords: %w{words to remove here}
32
32
  # → {"compatibility"=>1.0, "systems"=>1.0, "linear constraints"=>4.5, "set"=>2.0, "natural numbers"=>4.0, "criteria"=>1.0, "system"=>1.0, "linear diophantine equations"=>8.5, "strict inequations"=>4.0, "nonstrict inequations"=>4.0, "considered"=>1.5, "upper bounds"=>4.0, "components"=>1.0, "minimal set"=>4.666666666666666, "solutions"=>1.0, "algorithms"=>1.0, "construction"=>1.0, "minimal generating sets"=>8.666666666666666, "types"=>1.6666666666666667, "constructing"=>1.0, "minimal supporting set"=>7.666666666666666, "solving"=>1.0, "considered types"=>3.166666666666667, "mixed types"=>3.666666666666667}
33
33
  ```
34
34
 
35
- Use a custom stopword list:
35
+ Use a stopword list for a language:
36
36
 
37
37
  ```
38
- keyphrase.analyse "your text", ["custom","stopword","list"]
38
+ keyphrase.analyse "your text", lang: :kor
39
39
  ```
40
+ See `lib/keyphrase/stoplist` for all supported languages.
40
41
 
41
42
  Shorthand usage:
42
43
 
@@ -0,0 +1,14 @@
1
+ class Keyphrase
2
+ module Stoplist
3
+ class Afr
4
+ def self.stopwords
5
+ @@stopwords ||= %w{
6
+ 'n aan af al as baie by daar dag dat die dit een ek
7
+ en gaan gesê haar het hom hulle hy in is jou jy kan
8
+ kom ma maar met my na nie om ons op saam sal se sien
9
+ so sy te toe uit van vir was wat ʼn
10
+ }
11
+ end
12
+ end
13
+ end
14
+ end
@@ -0,0 +1,10 @@
1
+ class Keyphrase
2
+ module Stoplist
3
+ class Aka
4
+ def self.stopwords
5
+ @@stopwords ||= %w{
6
+ }
7
+ end
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,10 @@
1
+ class Keyphrase
2
+ module Stoplist
3
+ class Amh
4
+ def self.stopwords
5
+ @@stopwords ||= [
6
+ ]
7
+ end
8
+ end
9
+ end
10
+ end
@@ -0,0 +1,490 @@
1
+ class Keyphrase
2
+ module Stoplist
3
+ class Ara
4
+ def self.stopwords
5
+ @@stopwords ||= [
6
+ "،",
7
+ "آض",
8
+ "آمينَ",
9
+ "آه",
10
+ "آهاً",
11
+ "آي",
12
+ "أ",
13
+ "أب",
14
+ "أجل",
15
+ "أجمع",
16
+ "أخ",
17
+ "أخذ",
18
+ "أصبح",
19
+ "أضحى",
20
+ "أقبل",
21
+ "أقل",
22
+ "أكثر",
23
+ "ألا",
24
+ "أم",
25
+ "أما",
26
+ "أمامك",
27
+ "أمامكَ",
28
+ "أمسى",
29
+ "أمّا",
30
+ "أن",
31
+ "أنا",
32
+ "أنت",
33
+ "أنتم",
34
+ "أنتما",
35
+ "أنتن",
36
+ "أنتِ",
37
+ "أنشأ",
38
+ "أنّى",
39
+ "أو",
40
+ "أوشك",
41
+ "أولئك",
42
+ "أولئكم",
43
+ "أولاء",
44
+ "أولالك",
45
+ "أوّهْ",
46
+ "أي",
47
+ "أيا",
48
+ "أين",
49
+ "أينما",
50
+ "أيّ",
51
+ "أَنَّ",
52
+ "أََيُّ",
53
+ "أُفٍّ",
54
+ "إذ",
55
+ "إذا",
56
+ "إذاً",
57
+ "إذما",
58
+ "إذن",
59
+ "إلى",
60
+ "إليكم",
61
+ "إليكما",
62
+ "إليكنّ",
63
+ "إليكَ",
64
+ "إلَيْكَ",
65
+ "إلّا",
66
+ "إمّا",
67
+ "إن",
68
+ "إنّما",
69
+ "إي",
70
+ "إياك",
71
+ "إياكم",
72
+ "إياكما",
73
+ "إياكن",
74
+ "إيانا",
75
+ "إياه",
76
+ "إياها",
77
+ "إياهم",
78
+ "إياهما",
79
+ "إياهن",
80
+ "إياي",
81
+ "إيهٍ",
82
+ "إِنَّ",
83
+ "ا",
84
+ "ابتدأ",
85
+ "اثر",
86
+ "اجل",
87
+ "احد",
88
+ "اخرى",
89
+ "اخلولق",
90
+ "اذا",
91
+ "اربعة",
92
+ "ارتدّ",
93
+ "استحال",
94
+ "اطار",
95
+ "اعادة",
96
+ "اعلنت",
97
+ "اف",
98
+ "اكثر",
99
+ "اكد",
100
+ "الألاء",
101
+ "الألى",
102
+ "الا",
103
+ "الاخيرة",
104
+ "الان",
105
+ "الاول",
106
+ "الاولى",
107
+ "التى",
108
+ "التي",
109
+ "الثاني",
110
+ "الثانية",
111
+ "الذاتي",
112
+ "الذى",
113
+ "الذي",
114
+ "الذين",
115
+ "السابق",
116
+ "الف",
117
+ "اللائي",
118
+ "اللاتي",
119
+ "اللتان",
120
+ "اللتيا",
121
+ "اللتين",
122
+ "اللذان",
123
+ "اللذين",
124
+ "اللواتي",
125
+ "الماضي",
126
+ "المقبل",
127
+ "الوقت",
128
+ "الى",
129
+ "اليوم",
130
+ "اما",
131
+ "امام",
132
+ "امس",
133
+ "ان",
134
+ "انبرى",
135
+ "انقلب",
136
+ "انه",
137
+ "انها",
138
+ "او",
139
+ "اول",
140
+ "اي",
141
+ "ايار",
142
+ "ايام",
143
+ "ايضا",
144
+ "ب",
145
+ "بات",
146
+ "باسم",
147
+ "بان",
148
+ "بخٍ",
149
+ "برس",
150
+ "بسبب",
151
+ "بسّ",
152
+ "بشكل",
153
+ "بضع",
154
+ "بطآن",
155
+ "بعد",
156
+ "بعض",
157
+ "بك",
158
+ "بكم",
159
+ "بكما",
160
+ "بكن",
161
+ "بل",
162
+ "بلى",
163
+ "بما",
164
+ "بماذا",
165
+ "بمن",
166
+ "بن",
167
+ "بنا",
168
+ "به",
169
+ "بها",
170
+ "بي",
171
+ "بيد",
172
+ "بين",
173
+ "بَسْ",
174
+ "بَلْهَ",
175
+ "بِئْسَ",
176
+ "تانِ",
177
+ "تانِك",
178
+ "تبدّل",
179
+ "تجاه",
180
+ "تحوّل",
181
+ "تلقاء",
182
+ "تلك",
183
+ "تلكم",
184
+ "تلكما",
185
+ "تم",
186
+ "تينك",
187
+ "تَيْنِ",
188
+ "تِه",
189
+ "تِي",
190
+ "ثلاثة",
191
+ "ثم",
192
+ "ثمّ",
193
+ "ثمّة",
194
+ "ثُمَّ",
195
+ "جعل",
196
+ "جلل",
197
+ "جميع",
198
+ "جير",
199
+ "حار",
200
+ "حاشا",
201
+ "حاليا",
202
+ "حاي",
203
+ "حتى",
204
+ "حرى",
205
+ "حسب",
206
+ "حم",
207
+ "حوالى",
208
+ "حول",
209
+ "حيث",
210
+ "حيثما",
211
+ "حين",
212
+ "حيَّ",
213
+ "حَبَّذَا",
214
+ "حَتَّى",
215
+ "حَذارِ",
216
+ "خلا",
217
+ "خلال",
218
+ "دون",
219
+ "دونك",
220
+ "ذا",
221
+ "ذات",
222
+ "ذاك",
223
+ "ذانك",
224
+ "ذانِ",
225
+ "ذلك",
226
+ "ذلكم",
227
+ "ذلكما",
228
+ "ذلكن",
229
+ "ذو",
230
+ "ذوا",
231
+ "ذواتا",
232
+ "ذواتي",
233
+ "ذيت",
234
+ "ذينك",
235
+ "ذَيْنِ",
236
+ "ذِه",
237
+ "ذِي",
238
+ "راح",
239
+ "رجع",
240
+ "رويدك",
241
+ "ريث",
242
+ "رُبَّ",
243
+ "زيارة",
244
+ "سبحان",
245
+ "سرعان",
246
+ "سنة",
247
+ "سنوات",
248
+ "سوف",
249
+ "سوى",
250
+ "سَاءَ",
251
+ "سَاءَمَا",
252
+ "شبه",
253
+ "شخصا",
254
+ "شرع",
255
+ "شَتَّانَ",
256
+ "صار",
257
+ "صباح",
258
+ "صفر",
259
+ "صهٍ",
260
+ "صهْ",
261
+ "ضد",
262
+ "ضمن",
263
+ "طاق",
264
+ "طالما",
265
+ "طفق",
266
+ "طَق",
267
+ "ظلّ",
268
+ "عاد",
269
+ "عام",
270
+ "عاما",
271
+ "عامة",
272
+ "عدا",
273
+ "عدة",
274
+ "عدد",
275
+ "عدم",
276
+ "عسى",
277
+ "عشر",
278
+ "عشرة",
279
+ "علق",
280
+ "على",
281
+ "عليك",
282
+ "عليه",
283
+ "عليها",
284
+ "علًّ",
285
+ "عن",
286
+ "عند",
287
+ "عندما",
288
+ "عوض",
289
+ "عين",
290
+ "عَدَسْ",
291
+ "عَمَّا",
292
+ "غدا",
293
+ "غير",
294
+ "ـ",
295
+ "ف",
296
+ "فان",
297
+ "فلان",
298
+ "فو",
299
+ "فى",
300
+ "في",
301
+ "فيم",
302
+ "فيما",
303
+ "فيه",
304
+ "فيها",
305
+ "قال",
306
+ "قام",
307
+ "قبل",
308
+ "قد",
309
+ "قطّ",
310
+ "قلما",
311
+ "قوة",
312
+ "كأنّما",
313
+ "كأين",
314
+ "كأيّ",
315
+ "كأيّن",
316
+ "كاد",
317
+ "كان",
318
+ "كانت",
319
+ "كذا",
320
+ "كذلك",
321
+ "كرب",
322
+ "كل",
323
+ "كلا",
324
+ "كلاهما",
325
+ "كلتا",
326
+ "كلم",
327
+ "كليكما",
328
+ "كليهما",
329
+ "كلّما",
330
+ "كلَّا",
331
+ "كم",
332
+ "كما",
333
+ "كي",
334
+ "كيت",
335
+ "كيف",
336
+ "كيفما",
337
+ "كَأَنَّ",
338
+ "كِخ",
339
+ "لئن",
340
+ "لا",
341
+ "لات",
342
+ "لاسيما",
343
+ "لدن",
344
+ "لدى",
345
+ "لعمر",
346
+ "لقاء",
347
+ "لك",
348
+ "لكم",
349
+ "لكما",
350
+ "لكن",
351
+ "لكنَّما",
352
+ "لكي",
353
+ "لكيلا",
354
+ "للامم",
355
+ "لم",
356
+ "لما",
357
+ "لمّا",
358
+ "لن",
359
+ "لنا",
360
+ "له",
361
+ "لها",
362
+ "لو",
363
+ "لوكالة",
364
+ "لولا",
365
+ "لوما",
366
+ "لي",
367
+ "لَسْتَ",
368
+ "لَسْتُ",
369
+ "لَسْتُم",
370
+ "لَسْتُمَا",
371
+ "لَسْتُنَّ",
372
+ "لَسْتِ",
373
+ "لَسْنَ",
374
+ "لَعَلَّ",
375
+ "لَكِنَّ",
376
+ "لَيْتَ",
377
+ "لَيْسَ",
378
+ "لَيْسَا",
379
+ "لَيْسَتَا",
380
+ "لَيْسَتْ",
381
+ "لَيْسُوا",
382
+ "لَِسْنَا",
383
+ "ما",
384
+ "ماانفك",
385
+ "مابرح",
386
+ "مادام",
387
+ "ماذا",
388
+ "مازال",
389
+ "مافتئ",
390
+ "مايو",
391
+ "متى",
392
+ "مثل",
393
+ "مذ",
394
+ "مساء",
395
+ "مع",
396
+ "معاذ",
397
+ "مقابل",
398
+ "مكانكم",
399
+ "مكانكما",
400
+ "مكانكنّ",
401
+ "مكانَك",
402
+ "مليار",
403
+ "مليون",
404
+ "مما",
405
+ "ممن",
406
+ "من",
407
+ "منذ",
408
+ "منها",
409
+ "مه",
410
+ "مهما",
411
+ "مَنْ",
412
+ "مِن",
413
+ "نحن",
414
+ "نحو",
415
+ "نعم",
416
+ "نفس",
417
+ "نفسه",
418
+ "نهاية",
419
+ "نَخْ",
420
+ "نِعِمّا",
421
+ "نِعْمَ",
422
+ "ها",
423
+ "هاؤم",
424
+ "هاكَ",
425
+ "هاهنا",
426
+ "هبّ",
427
+ "هذا",
428
+ "هذه",
429
+ "هكذا",
430
+ "هل",
431
+ "هلمَّ",
432
+ "هلّا",
433
+ "هم",
434
+ "هما",
435
+ "هن",
436
+ "هنا",
437
+ "هناك",
438
+ "هنالك",
439
+ "هو",
440
+ "هي",
441
+ "هيا",
442
+ "هيت",
443
+ "هيّا",
444
+ "هَؤلاء",
445
+ "هَاتانِ",
446
+ "هَاتَيْنِ",
447
+ "هَاتِه",
448
+ "هَاتِي",
449
+ "هَجْ",
450
+ "هَذا",
451
+ "هَذانِ",
452
+ "هَذَيْنِ",
453
+ "هَذِه",
454
+ "هَذِي",
455
+ "هَيْهَاتَ",
456
+ "و",
457
+ "و6",
458
+ "وا",
459
+ "واحد",
460
+ "واضاف",
461
+ "واضافت",
462
+ "واكد",
463
+ "وان",
464
+ "واهاً",
465
+ "واوضح",
466
+ "وراءَك",
467
+ "وفي",
468
+ "وقال",
469
+ "وقالت",
470
+ "وقد",
471
+ "وقف",
472
+ "وكان",
473
+ "وكانت",
474
+ "ولا",
475
+ "ولم",
476
+ "ومن",
477
+ "وهو",
478
+ "وهي",
479
+ "ويكأنّ",
480
+ "وَيْ",
481
+ "وُشْكَانََ",
482
+ "يكون",
483
+ "يمكن",
484
+ "يوم",
485
+ "ّأيّان",
486
+ ]
487
+ end
488
+ end
489
+ end
490
+ end