keyphrase 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. checksums.yaml +4 -4
  2. data/README.md +5 -4
  3. data/lib/keyphrase/stoplist/afr.rb +14 -0
  4. data/lib/keyphrase/stoplist/aka.rb +10 -0
  5. data/lib/keyphrase/stoplist/amh.rb +10 -0
  6. data/lib/keyphrase/stoplist/ara.rb +490 -0
  7. data/lib/keyphrase/stoplist/aze.rb +175 -0
  8. data/lib/keyphrase/stoplist/bel.rb +11 -0
  9. data/lib/keyphrase/stoplist/ben.rb +408 -0
  10. data/lib/keyphrase/stoplist/bul.rb +528 -0
  11. data/lib/keyphrase/stoplist/cat.rb +711 -0
  12. data/lib/keyphrase/stoplist/ces.rb +560 -0
  13. data/lib/keyphrase/stoplist/cmn.rb +1119 -0
  14. data/lib/keyphrase/stoplist/dan.rb +25 -0
  15. data/lib/keyphrase/stoplist/deu.rb +631 -0
  16. data/lib/keyphrase/stoplist/ell.rb +275 -0
  17. data/lib/keyphrase/stoplist/eng.rb +2 -589
  18. data/lib/keyphrase/stoplist/epo.rb +183 -0
  19. data/lib/keyphrase/stoplist/est.rb +13 -0
  20. data/lib/keyphrase/stoplist/fin.rb +857 -0
  21. data/lib/keyphrase/stoplist/fra.rb +699 -0
  22. data/lib/keyphrase/stoplist/guj.rb +234 -0
  23. data/lib/keyphrase/stoplist/heb.rb +204 -0
  24. data/lib/keyphrase/stoplist/hin.rb +235 -0
  25. data/lib/keyphrase/stoplist/hrv.rb +25 -0
  26. data/lib/keyphrase/stoplist/hun.rb +1195 -0
  27. data/lib/keyphrase/stoplist/hye.rb +55 -0
  28. data/lib/keyphrase/stoplist/ind.rb +768 -0
  29. data/lib/keyphrase/stoplist/ita.rb +670 -0
  30. data/lib/keyphrase/stoplist/jav.rb +10 -0
  31. data/lib/keyphrase/stoplist/jpn.rb +144 -0
  32. data/lib/keyphrase/stoplist/kan.rb +92 -0
  33. data/lib/keyphrase/stoplist/kat.rb +383 -0
  34. data/lib/keyphrase/stoplist/khm.rb +245 -0
  35. data/lib/keyphrase/stoplist/kor.rb +610 -0
  36. data/lib/keyphrase/stoplist/lat.rb +14 -0
  37. data/lib/keyphrase/stoplist/lav.rb +171 -0
  38. data/lib/keyphrase/stoplist/lit.rb +484 -0
  39. data/lib/keyphrase/stoplist/mal.rb +11 -0
  40. data/lib/keyphrase/stoplist/mar.rb +109 -0
  41. data/lib/keyphrase/stoplist/mkd.rb +11 -0
  42. data/lib/keyphrase/stoplist/mya.rb +285 -0
  43. data/lib/keyphrase/stoplist/nep.rb +265 -0
  44. data/lib/keyphrase/stoplist/nld.rb +423 -0
  45. data/lib/keyphrase/stoplist/nob.rb +186 -0
  46. data/lib/keyphrase/stoplist/ori.rb +11 -0
  47. data/lib/keyphrase/stoplist/pan.rb +473 -0
  48. data/lib/keyphrase/stoplist/pes.rb +801 -0
  49. data/lib/keyphrase/stoplist/pol.rb +338 -0
  50. data/lib/keyphrase/stoplist/por.rb +570 -0
  51. data/lib/keyphrase/stoplist/ron.rb +444 -0
  52. data/lib/keyphrase/stoplist/rus.rb +569 -0
  53. data/lib/keyphrase/stoplist/sin.rb +10 -0
  54. data/lib/keyphrase/stoplist/slk.rb +428 -0
  55. data/lib/keyphrase/stoplist/slv.rb +456 -0
  56. data/lib/keyphrase/stoplist/sna.rb +11 -0
  57. data/lib/keyphrase/stoplist/spa.rb +731 -0
  58. data/lib/keyphrase/stoplist/srp.rb +11 -0
  59. data/lib/keyphrase/stoplist/swe.rb +428 -0
  60. data/lib/keyphrase/stoplist/tam.rb +135 -0
  61. data/lib/keyphrase/stoplist/tel.rb +10 -0
  62. data/lib/keyphrase/stoplist/tgl.rb +157 -0
  63. data/lib/keyphrase/stoplist/tha.rb +125 -0
  64. data/lib/keyphrase/stoplist/tuk.rb +11 -0
  65. data/lib/keyphrase/stoplist/tur.rb +514 -0
  66. data/lib/keyphrase/stoplist/ukr.rb +38 -0
  67. data/lib/keyphrase/stoplist/urd.rb +527 -0
  68. data/lib/keyphrase/stoplist/uzb.rb +10 -0
  69. data/lib/keyphrase/stoplist/vie.rb +655 -0
  70. data/lib/keyphrase/stoplist/yid.rb +204 -0
  71. data/lib/keyphrase/stoplist/zul.rb +39 -0
  72. data/lib/keyphrase/stoplist.rb +13 -10
  73. data/lib/keyphrase/version.rb +1 -1
  74. data/lib/keyphrase.rb +20 -12
  75. metadata +71 -3
data/lib/keyphrase/stoplist/yid.rb ADDED
@@ -0,0 +1,204 @@
1
+ class Keyphrase
2
+ module Stoplist
3
+ class Yid
4
+ def self.stopwords
5
+ @@stopwords ||= [
6
+ "אבל",
7
+ "או",
8
+ "אולי",
9
+ "אותה",
10
+ "אותו",
11
+ "אותי",
12
+ "אותך",
13
+ "אותם",
14
+ "אותן",
15
+ "אותנו",
16
+ "אז",
17
+ "אחר",
18
+ "אחרות",
19
+ "אחרי",
20
+ "אחריכן",
21
+ "אחרים",
22
+ "אחרת",
23
+ "אי",
24
+ "איזה",
25
+ "איך",
26
+ "אין",
27
+ "איפה",
28
+ "איתה",
29
+ "איתו",
30
+ "איתי",
31
+ "איתך",
32
+ "איתכם",
33
+ "איתכן",
34
+ "איתם",
35
+ "איתן",
36
+ "איתנו",
37
+ "אך",
38
+ "אל",
39
+ "אלה",
40
+ "אלו",
41
+ "אם",
42
+ "אנחנו",
43
+ "אני",
44
+ "אס",
45
+ "אף",
46
+ "אצל",
47
+ "אשר",
48
+ "את",
49
+ "אתה",
50
+ "אתכם",
51
+ "אתכן",
52
+ "אתם",
53
+ "אתן",
54
+ "באיזומידה",
55
+ "באמצע",
56
+ "באמצעות",
57
+ "בגלל",
58
+ "בין",
59
+ "בלי",
60
+ "במידה",
61
+ "במקוםשבו",
62
+ "ברם",
63
+ "בשביל",
64
+ "בשעהש",
65
+ "בתוך",
66
+ "גם",
67
+ "דרך",
68
+ "הוא",
69
+ "היא",
70
+ "היה",
71
+ "היכן",
72
+ "היתה",
73
+ "היתי",
74
+ "הם",
75
+ "הן",
76
+ "הנה",
77
+ "הסיבהשבגללה",
78
+ "הרי",
79
+ "ואילו",
80
+ "ואת",
81
+ "זאת",
82
+ "זה",
83
+ "זות",
84
+ "יהיה",
85
+ "יוכל",
86
+ "יוכלו",
87
+ "יותרמדי",
88
+ "יכול",
89
+ "יכולה",
90
+ "יכולות",
91
+ "יכולים",
92
+ "יכל",
93
+ "יכלה",
94
+ "יכלו",
95
+ "יש",
96
+ "כאן",
97
+ "כאשר",
98
+ "כולם",
99
+ "כולן",
100
+ "כזה",
101
+ "כי",
102
+ "כיצד",
103
+ "כך",
104
+ "ככה",
105
+ "כל",
106
+ "כלל",
107
+ "כמו",
108
+ "כן",
109
+ "כפי",
110
+ "כש",
111
+ "לא",
112
+ "לאו",
113
+ "לאיזותכלית",
114
+ "לאן",
115
+ "לבין",
116
+ "לה",
117
+ "להיות",
118
+ "להם",
119
+ "להן",
120
+ "לו",
121
+ "לי",
122
+ "לכם",
123
+ "לכן",
124
+ "למה",
125
+ "למטה",
126
+ "למעלה",
127
+ "למקוםשבו",
128
+ "למרות",
129
+ "לנו",
130
+ "לעבר",
131
+ "לעיכן",
132
+ "לפיכך",
133
+ "לפני",
134
+ "מאד",
135
+ "מאחורי",
136
+ "מאיזוסיבה",
137
+ "מאין",
138
+ "מאיפה",
139
+ "מבלי",
140
+ "מבעד",
141
+ "מדוע",
142
+ "מה",
143
+ "מהיכן",
144
+ "מול",
145
+ "מחוץ",
146
+ "מי",
147
+ "מכאן",
148
+ "מכיוון",
149
+ "מלבד",
150
+ "מן",
151
+ "מנין",
152
+ "מסוגל",
153
+ "מעט",
154
+ "מעטים",
155
+ "מעל",
156
+ "מצד",
157
+ "מקוםבו",
158
+ "מתחת",
159
+ "מתי",
160
+ "נגד",
161
+ "נגר",
162
+ "נו",
163
+ "עד",
164
+ "עז",
165
+ "על",
166
+ "עלי",
167
+ "עליה",
168
+ "עליהם",
169
+ "עליהן",
170
+ "עליו",
171
+ "עליך",
172
+ "עליכם",
173
+ "עלינו",
174
+ "עם",
175
+ "עצמה",
176
+ "עצמהם",
177
+ "עצמהן",
178
+ "עצמו",
179
+ "עצמי",
180
+ "עצמם",
181
+ "עצמן",
182
+ "עצמנו",
183
+ "פה",
184
+ "רק",
185
+ "שוב",
186
+ "של",
187
+ "שלה",
188
+ "שלהם",
189
+ "שלהן",
190
+ "שלו",
191
+ "שלי",
192
+ "שלך",
193
+ "שלכה",
194
+ "שלכם",
195
+ "שלכן",
196
+ "שלנו",
197
+ "שם",
198
+ "תהיה",
199
+ "תחת",
200
+ ]
201
+ end
202
+ end
203
+ end
204
+ end
data/lib/keyphrase/stoplist/zul.rb ADDED
@@ -0,0 +1,39 @@
1
+ class Keyphrase
2
+ module Stoplist
3
+ class Zul
4
+ def self.stopwords
5
+ @@stopwords ||= [
6
+ "futhi",
7
+ "kahle",
8
+ "kakhulu",
9
+ "kanye",
10
+ "khona",
11
+ "kodwa",
12
+ "kungani",
13
+ "kusho",
14
+ "la",
15
+ "lakhe",
16
+ "lapho",
17
+ "mina",
18
+ "ngesikhathi",
19
+ "nje",
20
+ "phansi",
21
+ "phezulu",
22
+ "u",
23
+ "ukuba",
24
+ "ukuthi",
25
+ "ukuze",
26
+ "uma",
27
+ "wahamba",
28
+ "wakhe",
29
+ "wami",
30
+ "wase",
31
+ "wathi",
32
+ "yakhe",
33
+ "zakhe",
34
+ "zonke",
35
+ ]
36
+ end
37
+ end
38
+ end
39
+ end
data/lib/keyphrase/stoplist.rb CHANGED
@@ -1,21 +1,24 @@
1
1
  module Keyphrase::Stoplist
2
+ class << self
3
+ # Class variable to store filenames
4
+ @@file_names = []
5
+
6
+ # Method to retrieve the array of filenames
7
+ def languages
8
+ @@file_names
9
+ end
10
+ end
11
+
2
12
  # Dynamically require all files in the stoplist directory
3
13
  Dir[File.join(__dir__, 'stoplist', '*.rb')].each do |file|
4
14
  require_relative file
15
+ @@file_names << File.basename(file, '.rb').to_sym
5
16
  end
6
17
 
7
- def self.stopwords lang, type=:smart
18
+ def self.stopwords_for_lang lang
8
19
  cl = const_get(lang.to_s.capitalize)
9
20
 
10
- if type == :strict
11
- cl.strict
12
- else
13
- cl.smart
14
- end
15
- end
16
-
17
- def self.stoplist_classes
18
- constants.map { |const| }
21
+ cl.stopwords
19
22
  end
20
23
 
21
24
  end
data/lib/keyphrase/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Keyphrase
4
- VERSION = "0.1.3"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/keyphrase.rb CHANGED
@@ -6,8 +6,8 @@ class Keyphrase
6
6
 
7
7
  autoload :Stoplist, "keyphrase/stoplist"
8
8
 
9
- CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
10
- BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
9
+ CLEAN_REGEX = /([^\p{L}a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
10
+ BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\p{L}]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
11
11
  CLEAN_SPACES_REGEX = /\s+/
12
12
  SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
13
13
 
@@ -16,8 +16,12 @@ class Keyphrase
16
16
  @@keyphrase.analyse text, options
17
17
  end
18
18
 
19
+ def initialize
20
+ @cached_regex = {}
21
+ end
22
+
19
23
  def analyse text, options={}
20
- stoplist = options[:stoplist] || :smart
24
+ stopwords = options[:stopwords]
21
25
  lang = options[:lang] || :eng
22
26
  clean_regex = options[:clean] || CLEAN_REGEX
23
27
  position_bonus = options[:position_bonus] || true
@@ -26,7 +30,7 @@ class Keyphrase
26
30
  sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
27
31
  clean_spaces_regex = options[:clean_spaces_regex] || CLEAN_SPACES_REGEX
28
32
 
29
- pattern = buildStopwordRegExPattern stoplist, lang
33
+ pattern = buildStopwordRegExPattern lang, stopwords
30
34
  sentences = text.split sentences_regex
31
35
  phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist, clean_spaces_regex
32
36
  wordscores = calculateWordScores phrases
@@ -49,16 +53,20 @@ class Keyphrase
49
53
 
50
54
  # create stopword pattern
51
55
  # 1
52
- def buildStopwordRegExPattern stopwords, lang
53
-
54
- if stopwords.is_a? Symbol
55
- # use caching
56
- return Keyphrase::Stoplist.stopwords lang, stopwords
56
+ def buildStopwordRegExPattern lang, stopwords = nil
57
+ stopwords ||= Keyphrase::Stoplist.stopwords_for_lang lang
58
+
59
+ # Check if the regex for the given language and stopwords is already cached
60
+ if @cached_regex[lang].nil? || @cached_regex[lang][:stopwords] != stopwords
61
+ # If not cached or stopwords have changed, recompile the regex and store in the cache
62
+ @cached_regex[lang] = {
63
+ stopwords: stopwords,
64
+ regex: Regexp.new("(?:^|\\s)(?:#{stopwords.join('|')})(?:$|\\s)", Regexp::IGNORECASE | Regexp::MULTILINE)
65
+ }
57
66
  end
58
67
 
59
- stop_regex = /(?:^|\s)(?:#{stopwords.join('|')})(?:$|\s)/io
60
-
61
- return stop_regex
68
+ # Return the cached regex
69
+ @cached_regex[lang][:regex]
62
70
  end
63
71
 
64
72
  # generate candidate keywords
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: keyphrase
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben D'Angelo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-08 00:00:00.000000000 Z
11
+ date: 2023-12-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -38,7 +38,75 @@ files:
38
38
  - Rakefile
39
39
  - lib/keyphrase.rb
40
40
  - lib/keyphrase/stoplist.rb
41
+ - lib/keyphrase/stoplist/afr.rb
42
+ - lib/keyphrase/stoplist/aka.rb
43
+ - lib/keyphrase/stoplist/amh.rb
44
+ - lib/keyphrase/stoplist/ara.rb
45
+ - lib/keyphrase/stoplist/aze.rb
46
+ - lib/keyphrase/stoplist/bel.rb
47
+ - lib/keyphrase/stoplist/ben.rb
48
+ - lib/keyphrase/stoplist/bul.rb
49
+ - lib/keyphrase/stoplist/cat.rb
50
+ - lib/keyphrase/stoplist/ces.rb
51
+ - lib/keyphrase/stoplist/cmn.rb
52
+ - lib/keyphrase/stoplist/dan.rb
53
+ - lib/keyphrase/stoplist/deu.rb
54
+ - lib/keyphrase/stoplist/ell.rb
41
55
  - lib/keyphrase/stoplist/eng.rb
56
+ - lib/keyphrase/stoplist/epo.rb
57
+ - lib/keyphrase/stoplist/est.rb
58
+ - lib/keyphrase/stoplist/fin.rb
59
+ - lib/keyphrase/stoplist/fra.rb
60
+ - lib/keyphrase/stoplist/guj.rb
61
+ - lib/keyphrase/stoplist/heb.rb
62
+ - lib/keyphrase/stoplist/hin.rb
63
+ - lib/keyphrase/stoplist/hrv.rb
64
+ - lib/keyphrase/stoplist/hun.rb
65
+ - lib/keyphrase/stoplist/hye.rb
66
+ - lib/keyphrase/stoplist/ind.rb
67
+ - lib/keyphrase/stoplist/ita.rb
68
+ - lib/keyphrase/stoplist/jav.rb
69
+ - lib/keyphrase/stoplist/jpn.rb
70
+ - lib/keyphrase/stoplist/kan.rb
71
+ - lib/keyphrase/stoplist/kat.rb
72
+ - lib/keyphrase/stoplist/khm.rb
73
+ - lib/keyphrase/stoplist/kor.rb
74
+ - lib/keyphrase/stoplist/lat.rb
75
+ - lib/keyphrase/stoplist/lav.rb
76
+ - lib/keyphrase/stoplist/lit.rb
77
+ - lib/keyphrase/stoplist/mal.rb
78
+ - lib/keyphrase/stoplist/mar.rb
79
+ - lib/keyphrase/stoplist/mkd.rb
80
+ - lib/keyphrase/stoplist/mya.rb
81
+ - lib/keyphrase/stoplist/nep.rb
82
+ - lib/keyphrase/stoplist/nld.rb
83
+ - lib/keyphrase/stoplist/nob.rb
84
+ - lib/keyphrase/stoplist/ori.rb
85
+ - lib/keyphrase/stoplist/pan.rb
86
+ - lib/keyphrase/stoplist/pes.rb
87
+ - lib/keyphrase/stoplist/pol.rb
88
+ - lib/keyphrase/stoplist/por.rb
89
+ - lib/keyphrase/stoplist/ron.rb
90
+ - lib/keyphrase/stoplist/rus.rb
91
+ - lib/keyphrase/stoplist/sin.rb
92
+ - lib/keyphrase/stoplist/slk.rb
93
+ - lib/keyphrase/stoplist/slv.rb
94
+ - lib/keyphrase/stoplist/sna.rb
95
+ - lib/keyphrase/stoplist/spa.rb
96
+ - lib/keyphrase/stoplist/srp.rb
97
+ - lib/keyphrase/stoplist/swe.rb
98
+ - lib/keyphrase/stoplist/tam.rb
99
+ - lib/keyphrase/stoplist/tel.rb
100
+ - lib/keyphrase/stoplist/tgl.rb
101
+ - lib/keyphrase/stoplist/tha.rb
102
+ - lib/keyphrase/stoplist/tuk.rb
103
+ - lib/keyphrase/stoplist/tur.rb
104
+ - lib/keyphrase/stoplist/ukr.rb
105
+ - lib/keyphrase/stoplist/urd.rb
106
+ - lib/keyphrase/stoplist/uzb.rb
107
+ - lib/keyphrase/stoplist/vie.rb
108
+ - lib/keyphrase/stoplist/yid.rb
109
+ - lib/keyphrase/stoplist/zul.rb
42
110
  - lib/keyphrase/version.rb
43
111
  - sig/keyphrase.rbs
44
112
  homepage: https://github.com/bendangelo/keyphrase
@@ -62,7 +130,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
62
130
  - !ruby/object:Gem::Version
63
131
  version: '0'
64
132
  requirements: []
65
- rubygems_version: 3.4.22
133
+ rubygems_version: 3.5.3
66
134
  signing_key:
67
135
  specification_version: 4
68
136
  summary: Extracts keywords from texts using a stoplist and some magic.