keyphrase 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (75)
  1. checksums.yaml +4 -4
  2. data/README.md +5 -4
  3. data/lib/keyphrase/stoplist/afr.rb +14 -0
  4. data/lib/keyphrase/stoplist/aka.rb +10 -0
  5. data/lib/keyphrase/stoplist/amh.rb +10 -0
  6. data/lib/keyphrase/stoplist/ara.rb +490 -0
  7. data/lib/keyphrase/stoplist/aze.rb +175 -0
  8. data/lib/keyphrase/stoplist/bel.rb +11 -0
  9. data/lib/keyphrase/stoplist/ben.rb +408 -0
  10. data/lib/keyphrase/stoplist/bul.rb +528 -0
  11. data/lib/keyphrase/stoplist/cat.rb +711 -0
  12. data/lib/keyphrase/stoplist/ces.rb +560 -0
  13. data/lib/keyphrase/stoplist/cmn.rb +1119 -0
  14. data/lib/keyphrase/stoplist/dan.rb +25 -0
  15. data/lib/keyphrase/stoplist/deu.rb +631 -0
  16. data/lib/keyphrase/stoplist/ell.rb +275 -0
  17. data/lib/keyphrase/stoplist/eng.rb +2 -589
  18. data/lib/keyphrase/stoplist/epo.rb +183 -0
  19. data/lib/keyphrase/stoplist/est.rb +13 -0
  20. data/lib/keyphrase/stoplist/fin.rb +857 -0
  21. data/lib/keyphrase/stoplist/fra.rb +699 -0
  22. data/lib/keyphrase/stoplist/guj.rb +234 -0
  23. data/lib/keyphrase/stoplist/heb.rb +204 -0
  24. data/lib/keyphrase/stoplist/hin.rb +235 -0
  25. data/lib/keyphrase/stoplist/hrv.rb +25 -0
  26. data/lib/keyphrase/stoplist/hun.rb +1195 -0
  27. data/lib/keyphrase/stoplist/hye.rb +55 -0
  28. data/lib/keyphrase/stoplist/ind.rb +768 -0
  29. data/lib/keyphrase/stoplist/ita.rb +670 -0
  30. data/lib/keyphrase/stoplist/jav.rb +10 -0
  31. data/lib/keyphrase/stoplist/jpn.rb +144 -0
  32. data/lib/keyphrase/stoplist/kan.rb +92 -0
  33. data/lib/keyphrase/stoplist/kat.rb +383 -0
  34. data/lib/keyphrase/stoplist/khm.rb +245 -0
  35. data/lib/keyphrase/stoplist/kor.rb +610 -0
  36. data/lib/keyphrase/stoplist/lat.rb +14 -0
  37. data/lib/keyphrase/stoplist/lav.rb +171 -0
  38. data/lib/keyphrase/stoplist/lit.rb +484 -0
  39. data/lib/keyphrase/stoplist/mal.rb +11 -0
  40. data/lib/keyphrase/stoplist/mar.rb +109 -0
  41. data/lib/keyphrase/stoplist/mkd.rb +11 -0
  42. data/lib/keyphrase/stoplist/mya.rb +285 -0
  43. data/lib/keyphrase/stoplist/nep.rb +265 -0
  44. data/lib/keyphrase/stoplist/nld.rb +423 -0
  45. data/lib/keyphrase/stoplist/nob.rb +186 -0
  46. data/lib/keyphrase/stoplist/ori.rb +11 -0
  47. data/lib/keyphrase/stoplist/pan.rb +473 -0
  48. data/lib/keyphrase/stoplist/pes.rb +801 -0
  49. data/lib/keyphrase/stoplist/pol.rb +338 -0
  50. data/lib/keyphrase/stoplist/por.rb +570 -0
  51. data/lib/keyphrase/stoplist/ron.rb +444 -0
  52. data/lib/keyphrase/stoplist/rus.rb +569 -0
  53. data/lib/keyphrase/stoplist/sin.rb +10 -0
  54. data/lib/keyphrase/stoplist/slk.rb +428 -0
  55. data/lib/keyphrase/stoplist/slv.rb +456 -0
  56. data/lib/keyphrase/stoplist/sna.rb +11 -0
  57. data/lib/keyphrase/stoplist/spa.rb +731 -0
  58. data/lib/keyphrase/stoplist/srp.rb +11 -0
  59. data/lib/keyphrase/stoplist/swe.rb +428 -0
  60. data/lib/keyphrase/stoplist/tam.rb +135 -0
  61. data/lib/keyphrase/stoplist/tel.rb +10 -0
  62. data/lib/keyphrase/stoplist/tgl.rb +157 -0
  63. data/lib/keyphrase/stoplist/tha.rb +125 -0
  64. data/lib/keyphrase/stoplist/tuk.rb +11 -0
  65. data/lib/keyphrase/stoplist/tur.rb +514 -0
  66. data/lib/keyphrase/stoplist/ukr.rb +38 -0
  67. data/lib/keyphrase/stoplist/urd.rb +527 -0
  68. data/lib/keyphrase/stoplist/uzb.rb +10 -0
  69. data/lib/keyphrase/stoplist/vie.rb +655 -0
  70. data/lib/keyphrase/stoplist/yid.rb +204 -0
  71. data/lib/keyphrase/stoplist/zul.rb +39 -0
  72. data/lib/keyphrase/stoplist.rb +13 -10
  73. data/lib/keyphrase/version.rb +1 -1
  74. data/lib/keyphrase.rb +20 -12
  75. metadata +71 -3
@@ -0,0 +1,204 @@
1
+ class Keyphrase
2
+ module Stoplist
3
+ class Yid
4
+ def self.stopwords
5
+ @@stopwords ||= [
6
+ "אבל",
7
+ "או",
8
+ "אולי",
9
+ "אותה",
10
+ "אותו",
11
+ "אותי",
12
+ "אותך",
13
+ "אותם",
14
+ "אותן",
15
+ "אותנו",
16
+ "אז",
17
+ "אחר",
18
+ "אחרות",
19
+ "אחרי",
20
+ "אחריכן",
21
+ "אחרים",
22
+ "אחרת",
23
+ "אי",
24
+ "איזה",
25
+ "איך",
26
+ "אין",
27
+ "איפה",
28
+ "איתה",
29
+ "איתו",
30
+ "איתי",
31
+ "איתך",
32
+ "איתכם",
33
+ "איתכן",
34
+ "איתם",
35
+ "איתן",
36
+ "איתנו",
37
+ "אך",
38
+ "אל",
39
+ "אלה",
40
+ "אלו",
41
+ "אם",
42
+ "אנחנו",
43
+ "אני",
44
+ "אס",
45
+ "אף",
46
+ "אצל",
47
+ "אשר",
48
+ "את",
49
+ "אתה",
50
+ "אתכם",
51
+ "אתכן",
52
+ "אתם",
53
+ "אתן",
54
+ "באיזומידה",
55
+ "באמצע",
56
+ "באמצעות",
57
+ "בגלל",
58
+ "בין",
59
+ "בלי",
60
+ "במידה",
61
+ "במקוםשבו",
62
+ "ברם",
63
+ "בשביל",
64
+ "בשעהש",
65
+ "בתוך",
66
+ "גם",
67
+ "דרך",
68
+ "הוא",
69
+ "היא",
70
+ "היה",
71
+ "היכן",
72
+ "היתה",
73
+ "היתי",
74
+ "הם",
75
+ "הן",
76
+ "הנה",
77
+ "הסיבהשבגללה",
78
+ "הרי",
79
+ "ואילו",
80
+ "ואת",
81
+ "זאת",
82
+ "זה",
83
+ "זות",
84
+ "יהיה",
85
+ "יוכל",
86
+ "יוכלו",
87
+ "יותרמדי",
88
+ "יכול",
89
+ "יכולה",
90
+ "יכולות",
91
+ "יכולים",
92
+ "יכל",
93
+ "יכלה",
94
+ "יכלו",
95
+ "יש",
96
+ "כאן",
97
+ "כאשר",
98
+ "כולם",
99
+ "כולן",
100
+ "כזה",
101
+ "כי",
102
+ "כיצד",
103
+ "כך",
104
+ "ככה",
105
+ "כל",
106
+ "כלל",
107
+ "כמו",
108
+ "כן",
109
+ "כפי",
110
+ "כש",
111
+ "לא",
112
+ "לאו",
113
+ "לאיזותכלית",
114
+ "לאן",
115
+ "לבין",
116
+ "לה",
117
+ "להיות",
118
+ "להם",
119
+ "להן",
120
+ "לו",
121
+ "לי",
122
+ "לכם",
123
+ "לכן",
124
+ "למה",
125
+ "למטה",
126
+ "למעלה",
127
+ "למקוםשבו",
128
+ "למרות",
129
+ "לנו",
130
+ "לעבר",
131
+ "לעיכן",
132
+ "לפיכך",
133
+ "לפני",
134
+ "מאד",
135
+ "מאחורי",
136
+ "מאיזוסיבה",
137
+ "מאין",
138
+ "מאיפה",
139
+ "מבלי",
140
+ "מבעד",
141
+ "מדוע",
142
+ "מה",
143
+ "מהיכן",
144
+ "מול",
145
+ "מחוץ",
146
+ "מי",
147
+ "מכאן",
148
+ "מכיוון",
149
+ "מלבד",
150
+ "מן",
151
+ "מנין",
152
+ "מסוגל",
153
+ "מעט",
154
+ "מעטים",
155
+ "מעל",
156
+ "מצד",
157
+ "מקוםבו",
158
+ "מתחת",
159
+ "מתי",
160
+ "נגד",
161
+ "נגר",
162
+ "נו",
163
+ "עד",
164
+ "עז",
165
+ "על",
166
+ "עלי",
167
+ "עליה",
168
+ "עליהם",
169
+ "עליהן",
170
+ "עליו",
171
+ "עליך",
172
+ "עליכם",
173
+ "עלינו",
174
+ "עם",
175
+ "עצמה",
176
+ "עצמהם",
177
+ "עצמהן",
178
+ "עצמו",
179
+ "עצמי",
180
+ "עצמם",
181
+ "עצמן",
182
+ "עצמנו",
183
+ "פה",
184
+ "רק",
185
+ "שוב",
186
+ "של",
187
+ "שלה",
188
+ "שלהם",
189
+ "שלהן",
190
+ "שלו",
191
+ "שלי",
192
+ "שלך",
193
+ "שלכה",
194
+ "שלכם",
195
+ "שלכן",
196
+ "שלנו",
197
+ "שם",
198
+ "תהיה",
199
+ "תחת",
200
+ ]
201
+ end
202
+ end
203
+ end
204
+ end
@@ -0,0 +1,39 @@
1
+ class Keyphrase
2
+ module Stoplist
3
+ class Zul
4
+ def self.stopwords
5
+ @@stopwords ||= [
6
+ "futhi",
7
+ "kahle",
8
+ "kakhulu",
9
+ "kanye",
10
+ "khona",
11
+ "kodwa",
12
+ "kungani",
13
+ "kusho",
14
+ "la",
15
+ "lakhe",
16
+ "lapho",
17
+ "mina",
18
+ "ngesikhathi",
19
+ "nje",
20
+ "phansi",
21
+ "phezulu",
22
+ "u",
23
+ "ukuba",
24
+ "ukuthi",
25
+ "ukuze",
26
+ "uma",
27
+ "wahamba",
28
+ "wakhe",
29
+ "wami",
30
+ "wase",
31
+ "wathi",
32
+ "yakhe",
33
+ "zakhe",
34
+ "zonke",
35
+ ]
36
+ end
37
+ end
38
+ end
39
+ end
@@ -1,21 +1,24 @@
1
1
  module Keyphrase::Stoplist
2
+ class << self
3
+ # Class variable to store filenames
4
+ @@file_names = []
5
+
6
+ # Method to retrieve the array of filenames
7
+ def languages
8
+ @@file_names
9
+ end
10
+ end
11
+
2
12
  # Dynamically require all files in the stoplist directory
3
13
  Dir[File.join(__dir__, 'stoplist', '*.rb')].each do |file|
4
14
  require_relative file
15
+ @@file_names << File.basename(file, '.rb').to_sym
5
16
  end
6
17
 
7
- def self.stopwords lang, type=:smart
18
+ def self.stopwords_for_lang lang
8
19
  cl = const_get(lang.to_s.capitalize)
9
20
 
10
- if type == :strict
11
- cl.strict
12
- else
13
- cl.smart
14
- end
15
- end
16
-
17
- def self.stoplist_classes
18
- constants.map { |const| }
21
+ cl.stopwords
19
22
  end
20
23
 
21
24
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Keyphrase
4
- VERSION = "0.1.3"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/keyphrase.rb CHANGED
@@ -6,8 +6,8 @@ class Keyphrase
6
6
 
7
7
  autoload :Stoplist, "keyphrase/stoplist"
8
8
 
9
- CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
10
- BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
9
+ CLEAN_REGEX = /([^\p{L}a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
10
+ BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\p{L}]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
11
11
  CLEAN_SPACES_REGEX = /\s+/
12
12
  SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
13
13
 
@@ -16,8 +16,12 @@ class Keyphrase
16
16
  @@keyphrase.analyse text, options
17
17
  end
18
18
 
19
+ def initialize
20
+ @cached_regex = {}
21
+ end
22
+
19
23
  def analyse text, options={}
20
- stoplist = options[:stoplist] || :smart
24
+ stopwords = options[:stopwords]
21
25
  lang = options[:lang] || :eng
22
26
  clean_regex = options[:clean] || CLEAN_REGEX
23
27
  position_bonus = options[:position_bonus] || true
@@ -26,7 +30,7 @@ class Keyphrase
26
30
  sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
27
31
  clean_spaces_regex = options[:clean_spaces_regex] || CLEAN_SPACES_REGEX
28
32
 
29
- pattern = buildStopwordRegExPattern stoplist, lang
33
+ pattern = buildStopwordRegExPattern lang, stopwords
30
34
  sentences = text.split sentences_regex
31
35
  phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist, clean_spaces_regex
32
36
  wordscores = calculateWordScores phrases
@@ -49,16 +53,20 @@ class Keyphrase
49
53
 
50
54
  # create stopword pattern
51
55
  # 1
52
- def buildStopwordRegExPattern stopwords, lang
53
-
54
- if stopwords.is_a? Symbol
55
- # use caching
56
- return Keyphrase::Stoplist.stopwords lang, stopwords
56
+ def buildStopwordRegExPattern lang, stopwords = nil
57
+ stopwords ||= Keyphrase::Stoplist.stopwords_for_lang lang
58
+
59
+ # Check if the regex for the given language and stopwords is already cached
60
+ if @cached_regex[lang].nil? || @cached_regex[lang][:stopwords] != stopwords
61
+ # If not cached or stopwords have changed, recompile the regex and store in the cache
62
+ @cached_regex[lang] = {
63
+ stopwords: stopwords,
64
+ regex: Regexp.new("(?:^|\\s)(?:#{stopwords.join('|')})(?:$|\\s)", Regexp::IGNORECASE | Regexp::MULTILINE)
65
+ }
57
66
  end
58
67
 
59
- stop_regex = /(?:^|\s)(?:#{stopwords.join('|')})(?:$|\s)/io
60
-
61
- return stop_regex
68
+ # Return the cached regex
69
+ @cached_regex[lang][:regex]
62
70
  end
63
71
 
64
72
  # generate candidate keywords
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: keyphrase
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben D'Angelo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-08 00:00:00.000000000 Z
11
+ date: 2023-12-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -38,7 +38,75 @@ files:
38
38
  - Rakefile
39
39
  - lib/keyphrase.rb
40
40
  - lib/keyphrase/stoplist.rb
41
+ - lib/keyphrase/stoplist/afr.rb
42
+ - lib/keyphrase/stoplist/aka.rb
43
+ - lib/keyphrase/stoplist/amh.rb
44
+ - lib/keyphrase/stoplist/ara.rb
45
+ - lib/keyphrase/stoplist/aze.rb
46
+ - lib/keyphrase/stoplist/bel.rb
47
+ - lib/keyphrase/stoplist/ben.rb
48
+ - lib/keyphrase/stoplist/bul.rb
49
+ - lib/keyphrase/stoplist/cat.rb
50
+ - lib/keyphrase/stoplist/ces.rb
51
+ - lib/keyphrase/stoplist/cmn.rb
52
+ - lib/keyphrase/stoplist/dan.rb
53
+ - lib/keyphrase/stoplist/deu.rb
54
+ - lib/keyphrase/stoplist/ell.rb
41
55
  - lib/keyphrase/stoplist/eng.rb
56
+ - lib/keyphrase/stoplist/epo.rb
57
+ - lib/keyphrase/stoplist/est.rb
58
+ - lib/keyphrase/stoplist/fin.rb
59
+ - lib/keyphrase/stoplist/fra.rb
60
+ - lib/keyphrase/stoplist/guj.rb
61
+ - lib/keyphrase/stoplist/heb.rb
62
+ - lib/keyphrase/stoplist/hin.rb
63
+ - lib/keyphrase/stoplist/hrv.rb
64
+ - lib/keyphrase/stoplist/hun.rb
65
+ - lib/keyphrase/stoplist/hye.rb
66
+ - lib/keyphrase/stoplist/ind.rb
67
+ - lib/keyphrase/stoplist/ita.rb
68
+ - lib/keyphrase/stoplist/jav.rb
69
+ - lib/keyphrase/stoplist/jpn.rb
70
+ - lib/keyphrase/stoplist/kan.rb
71
+ - lib/keyphrase/stoplist/kat.rb
72
+ - lib/keyphrase/stoplist/khm.rb
73
+ - lib/keyphrase/stoplist/kor.rb
74
+ - lib/keyphrase/stoplist/lat.rb
75
+ - lib/keyphrase/stoplist/lav.rb
76
+ - lib/keyphrase/stoplist/lit.rb
77
+ - lib/keyphrase/stoplist/mal.rb
78
+ - lib/keyphrase/stoplist/mar.rb
79
+ - lib/keyphrase/stoplist/mkd.rb
80
+ - lib/keyphrase/stoplist/mya.rb
81
+ - lib/keyphrase/stoplist/nep.rb
82
+ - lib/keyphrase/stoplist/nld.rb
83
+ - lib/keyphrase/stoplist/nob.rb
84
+ - lib/keyphrase/stoplist/ori.rb
85
+ - lib/keyphrase/stoplist/pan.rb
86
+ - lib/keyphrase/stoplist/pes.rb
87
+ - lib/keyphrase/stoplist/pol.rb
88
+ - lib/keyphrase/stoplist/por.rb
89
+ - lib/keyphrase/stoplist/ron.rb
90
+ - lib/keyphrase/stoplist/rus.rb
91
+ - lib/keyphrase/stoplist/sin.rb
92
+ - lib/keyphrase/stoplist/slk.rb
93
+ - lib/keyphrase/stoplist/slv.rb
94
+ - lib/keyphrase/stoplist/sna.rb
95
+ - lib/keyphrase/stoplist/spa.rb
96
+ - lib/keyphrase/stoplist/srp.rb
97
+ - lib/keyphrase/stoplist/swe.rb
98
+ - lib/keyphrase/stoplist/tam.rb
99
+ - lib/keyphrase/stoplist/tel.rb
100
+ - lib/keyphrase/stoplist/tgl.rb
101
+ - lib/keyphrase/stoplist/tha.rb
102
+ - lib/keyphrase/stoplist/tuk.rb
103
+ - lib/keyphrase/stoplist/tur.rb
104
+ - lib/keyphrase/stoplist/ukr.rb
105
+ - lib/keyphrase/stoplist/urd.rb
106
+ - lib/keyphrase/stoplist/uzb.rb
107
+ - lib/keyphrase/stoplist/vie.rb
108
+ - lib/keyphrase/stoplist/yid.rb
109
+ - lib/keyphrase/stoplist/zul.rb
42
110
  - lib/keyphrase/version.rb
43
111
  - sig/keyphrase.rbs
44
112
  homepage: https://github.com/bendangelo/keyphrase
@@ -62,7 +130,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
62
130
  - !ruby/object:Gem::Version
63
131
  version: '0'
64
132
  requirements: []
65
- rubygems_version: 3.4.22
133
+ rubygems_version: 3.5.3
66
134
  signing_key:
67
135
  specification_version: 4
68
136
  summary: Extracts keywords from texts using a stoplist and some magic.