keyword-ruby 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 433764acb546b2b35c3e05f56095ec7ba4642a8c67591d9466fb1a6a8ae47bdf
4
+ data.tar.gz: e83e1a3e1b454b584efccf44a592c7868e788ff756e946e4c7631e696982cc09
5
+ SHA512:
6
+ metadata.gz: b5dbf9ec6be9a3b3b657c03b6397df41de8152a02ab8539f5b62c435de8dd51ba80ca6d16645f50971d5bb48e9c2e1351a9e63081d61f89251c2d257f23cef54
7
+ data.tar.gz: 0d51a0fe597f68dff3231881ac131d69b85ee210bf638cdb95d8e8e8d1b2cdfab4f4e50c8f15d4c6271e394d647e5f4778e8a104270ea17a851c70d88cafc942
data/CHANGELOG.md ADDED
@@ -0,0 +1,11 @@
1
+ # Changelog
2
+
3
+ ## 0.1.0 (2026-03-09)
4
+
5
+ - Initial release
6
+ - RAKE keyword extraction algorithm
7
+ - YAKE keyword extraction algorithm
8
+ - TF-IDF keyword extraction algorithm
9
+ - English and Indonesian stop word lists
10
+ - Module-level `extract` and `extract_batch` API
11
+ - Configuration DSL
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Johannes Dwi Cahyo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,34 @@
1
+ # keyword-ruby
2
+
3
+ Pure Ruby keyword and keyphrase extraction using RAKE, YAKE, and TF-IDF algorithms.
4
+
5
+ ## Installation
6
+
7
+ ```ruby
8
+ gem "keyword-ruby", "~> 0.1"
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```ruby
14
+ require "keyword_ruby"
15
+
16
+ # RAKE (default)
17
+ keywords = KeywordRuby.extract("Your text here...")
18
+ keywords.each { |kw| puts "#{kw.phrase}: #{kw.score}" }
19
+
20
+ # YAKE
21
+ keywords = KeywordRuby.extract(text, algorithm: :yake)
22
+
23
+ # TF-IDF (with corpus)
24
+ extractor = KeywordRuby::Extractors::Tfidf.new
25
+ extractor.fit(corpus_documents)
26
+ keywords = extractor.extract(text)
27
+
28
+ # Batch extraction
29
+ results = KeywordRuby.extract_batch(documents, algorithm: :rake, top_n: 5)
30
+ ```
31
+
32
+ ## License
33
+
34
+ MIT
data/Rakefile ADDED
@@ -0,0 +1,11 @@
# frozen_string_literal: true

require "rake/testtask"

# `rake test` runs every test/**/test_*.rb file with both test/ and lib/
# on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs << "test"
  t.libs << "lib"
  t.test_files = FileList["test/**/test_*.rb"]
end

# Running `rake` with no arguments executes the test suite.
task default: :test
@@ -0,0 +1,33 @@
# frozen_string_literal: true

require_relative "lib/keyword_ruby/version"

Gem::Specification.new do |spec|
  spec.name = "keyword-ruby"
  # Single source of truth for the version is lib/keyword_ruby/version.rb.
  spec.version = KeywordRuby::VERSION
  spec.authors = ["Johannes Dwi Cahyo"]
  spec.email = ["johannes@example.com"]
  spec.summary = "Keyword extraction for Ruby using RAKE, YAKE, and TF-IDF"
  spec.description = "Pure Ruby keyword and keyphrase extraction library. Implements RAKE, YAKE, and TF-IDF algorithms for extracting keywords from text."
  spec.homepage = "https://github.com/johannesdwicahyo/keyword-ruby"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 3.0.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage
  spec.metadata["changelog_uri"] = "#{spec.homepage}/blob/main/CHANGELOG.md"

  # Package only library code, the bundled stop-word lists (lib/**/*.txt),
  # and top-level documentation -- tests are not shipped.
  spec.files = Dir[
    "lib/**/*.rb",
    "lib/**/*.txt",
    "README.md",
    "LICENSE",
    "CHANGELOG.md",
    "Rakefile",
    "keyword-ruby.gemspec"
  ]
  spec.require_paths = ["lib"]

  # The gem has no runtime dependencies; these are development-only.
  spec.add_development_dependency "minitest", "~> 5.0"
  spec.add_development_dependency "rake", "~> 13.0"
end
@@ -0,0 +1,16 @@
# frozen_string_literal: true

module KeywordRuby
  # Mutable library-wide settings, reached through KeywordRuby.configuration
  # and seeded with sensible defaults on creation.
  class Configuration
    # Default values: RAKE algorithm, English, 10 keywords, phrases of at
    # most 4 words, each word at least 2 characters long.
    DEFAULTS = {
      default_algorithm: :rake,
      default_language: :en,
      default_top_n: 10,
      max_phrase_length: 4,
      min_word_length: 2
    }.freeze
    private_constant :DEFAULTS

    attr_accessor :default_algorithm, :default_language, :default_top_n,
                  :max_phrase_length, :min_word_length

    # Populates every setting from the DEFAULTS table.
    def initialize
      DEFAULTS.each { |name, value| instance_variable_set(:"@#{name}", value) }
    end
  end
end
@@ -0,0 +1,65 @@
# frozen_string_literal: true

module KeywordRuby
  module Extractors
    # Behaviour shared by every extraction algorithm: resolving options
    # against the global configuration, validating them, stop-word lookup,
    # and min-max normalization of result scores.
    class Base
      SUPPORTED_LANGUAGES = %i[en id].freeze

      # Any option left nil falls back to KeywordRuby.configuration.
      def initialize(language: nil, top_n: nil, max_length: nil, min_word_length: nil, normalize: true)
        defaults = KeywordRuby.configuration
        @language = language || defaults.default_language
        @top_n = top_n || defaults.default_top_n
        @max_length = max_length || defaults.max_phrase_length
        @min_word_length = min_word_length || defaults.min_word_length
        @normalize = normalize

        validate_params!

        @stop_words = TextProcessing::StopWords.new(language: @language)
      end

      # Subclasses implement the actual extraction algorithm.
      def extract(text)
        raise NotImplementedError, "#{self.class}#extract not implemented"
      end

      private

      def validate_params!
        unless SUPPORTED_LANGUAGES.include?(@language)
          raise ArgumentError, "Unsupported language: #{@language.inspect}. Supported: #{SUPPORTED_LANGUAGES.join(', ')}"
        end

        return if @top_n.is_a?(Integer) && @top_n > 0

        raise ArgumentError, "top_n must be a positive integer, got: #{@top_n.inspect}"
      end

      # nil is tolerated here; extractors treat nil text as "no keywords".
      def validate_text!(text)
        return if text.nil? || text.is_a?(String)

        raise ArgumentError, "text must be a String, got #{text.class}"
      end

      # Rescales scores into [0, 1]. A degenerate set (all scores equal)
      # maps every keyword to 1.0. No-op when normalization is disabled
      # or there is nothing to normalize.
      def normalize_scores(keywords)
        return keywords if !@normalize || keywords.empty?

        lowest, highest = keywords.map(&:score).minmax
        span = highest - lowest

        keywords.map do |kw|
          rescaled = span.zero? ? 1.0 : (kw.score - lowest) / span
          Keyword.new(phrase: kw.phrase, score: rescaled, position: kw.position)
        end
      end

      def stop_word?(word)
        @stop_words.stop_word?(word)
      end
    end
  end
end
@@ -0,0 +1,79 @@
# frozen_string_literal: true

module KeywordRuby
  module Extractors
    # RAKE (Rapid Automatic Keyword Extraction). Candidate phrases are
    # maximal runs of non-stop words within a sentence; each word is
    # scored degree/frequency over the phrase co-occurrence graph, and a
    # phrase scores the sum of its word scores.
    class Rake < Base
      # Returns up to @top_n Keyword objects for +text+, best first.
      def extract(text)
        validate_text!(text)
        return [] if text.nil? || text.strip.empty?

        phrases = extract_candidates(TextProcessing::Tokenizer.sentences(text))
        return [] if phrases.empty?

        scores = calculate_word_scores(phrases)

        ranked = phrases.uniq.map do |phrase|
          total = phrase.split(/\s+/).sum { |word| scores.fetch(word, 0.0) }
          Keyword.new(phrase: phrase, score: total)
        end

        normalize_scores(ranked.sort.first(@top_n))
      end

      private

      # Splits each sentence at stop words and too-short words; each run
      # of surviving words (of at most @max_length words) becomes one
      # candidate phrase. Runs longer than @max_length are dropped whole.
      def extract_candidates(sentences)
        sentences.flat_map do |sentence|
          runs = []
          run = []

          TextProcessing::Tokenizer.tokenize(sentence).each do |word|
            if stop_word?(word) || word.length < @min_word_length
              runs << run unless run.empty?
              run = []
            else
              run << word
            end
          end
          runs << run unless run.empty?

          runs.select { |r| r.length <= @max_length }.map { |r| r.join(" ") }
        end
      end

      # Word score = deg(w) / freq(w). deg(w) accumulates the length of
      # every phrase occurrence containing w (which equals the row sum of
      # the co-occurrence matrix, self-pairs included).
      def calculate_word_scores(phrases)
        frequency = Hash.new(0)
        degree = Hash.new(0)

        phrases.each do |phrase|
          members = phrase.split(/\s+/)
          members.each do |word|
            frequency[word] += 1
            degree[word] += members.length
          end
        end

        frequency.each_with_object({}) do |(word, freq), scores|
          scores[word] = degree[word].to_f / freq
        end
      end
    end
  end
end
@@ -0,0 +1,61 @@
# frozen_string_literal: true

module KeywordRuby
  module Extractors
    # Single-word TF-IDF extractor. Call #fit with a corpus to learn IDF
    # weights; without a fitted corpus scores degrade to plain term
    # frequency (IDF treated as 1.0).
    class Tfidf < Base
      def initialize(**opts)
        super
        @idf = nil
        @doc_count = 0
      end

      # Learns inverse document frequencies from +documents+ (an array of
      # strings). Returns self so calls can be chained.
      def fit(documents)
        @doc_count = documents.size

        df = Hash.new(0)
        documents.each do |document|
          TextProcessing::Tokenizer.tokenize(document).uniq.each { |word| df[word] += 1 }
        end

        @idf = df.each_with_object({}) do |(word, count), table|
          table[word] = Math.log(@doc_count.to_f / (1 + count))
        end

        self
      end

      # Returns up to @top_n Keyword objects for +text+, best first.
      # Stop words and words shorter than @min_word_length are skipped.
      def extract(text)
        validate_text!(text)
        return [] if text.nil? || text.strip.empty?

        tokens = TextProcessing::Tokenizer.tokenize(text)
        return [] if tokens.empty?

        total = tokens.size.to_f

        scored = tokens.tally.filter_map do |word, count|
          next if stop_word?(word) || word.length < @min_word_length

          Keyword.new(phrase: word, score: (count / total) * idf_for(word))
        end

        normalize_scores(scored.sort.first(@top_n))
      end

      private

      # IDF weight for +word+: 1.0 when no corpus has been fitted;
      # otherwise the learned value, with unseen words getting the
      # maximum possible IDF.
      def idf_for(word)
        return 1.0 unless @idf

        @idf[word] || Math.log((@doc_count + 1).to_f / 1)
      end
    end
  end
end
@@ -0,0 +1,101 @@
# frozen_string_literal: true

module KeywordRuby
  module Extractors
    # Simplified YAKE-style extractor: single words are scored from term
    # frequency, first position, and length; candidate n-grams combine
    # their member-word scores by product. Raw YAKE scores are
    # lower-is-better, so they are inverted before ranking so that higher
    # scores win, matching the other extractors.
    class Yake < Base
      # Returns up to @top_n Keyword objects for +text+, best first.
      def extract(text)
        validate_text!(text)
        return [] if text.nil? || text.strip.empty?

        words = TextProcessing::Tokenizer.tokenize(text)
        sentences = TextProcessing::Tokenizer.sentences(text)
        return [] if words.empty?

        word_stats = calculate_features(words, sentences)
        candidates = generate_candidates(words)
        return [] if candidates.empty?

        scored = candidates.map do |phrase|
          score = score_candidate(phrase, word_stats)
          Keyword.new(phrase: phrase, score: score)
        end

        # Invert so higher is better; the small epsilon keeps the best
        # candidate's score above zero.
        max_score = scored.map(&:score).max
        inverted = scored.map do |kw|
          Keyword.new(phrase: kw.phrase, score: max_score - kw.score + 0.001)
        end

        results = inverted.sort.first(@top_n)
        normalize_scores(results)
      end

      private

      # Per-word statistics: relative term frequency, relative first
      # position, and a combined score tf * (1 + position) / length_norm.
      # Stop words and too-short words are skipped. (+sentences+ is
      # currently unused; kept for future sentence-spread features.)
      def calculate_features(words, sentences)
        total = words.size.to_f
        stats = {}

        word_freq = Hash.new(0)
        words.each { |w| word_freq[w] += 1 }

        first_positions = {}
        words.each_with_index do |w, i|
          first_positions[w] ||= i
        end

        word_freq.each do |word, freq|
          next if stop_word?(word) || word.length < @min_word_length

          tf = freq / total
          pos = (first_positions[word] || 0) / total
          # Words longer than 3 characters get full weight.
          len_norm = word.length > 3 ? 1.0 : 0.5

          stats[word] = {
            tf: tf,
            position: pos,
            frequency: freq,
            score: tf * (1.0 + pos) / len_norm
          }
        end

        stats
      end

      # All 1..@max_length n-grams whose first and last words are neither
      # stop words nor shorter than @min_word_length, de-duplicated while
      # preserving first-seen order.
      def generate_candidates(words)
        # Insertion-ordered Hash instead of Set: Set is only autoloaded
        # from Ruby 3.2 and this gem never requires "set", so Set.new
        # raised NameError on Ruby 3.0/3.1 (both supported by the gemspec).
        candidates = {}

        (1..@max_length).each do |n|
          words.each_cons(n) do |gram|
            # First and last words must not be stop words
            next if stop_word?(gram.first) || stop_word?(gram.last)
            # First and last words must meet minimum length
            next if gram.first.length < @min_word_length || gram.last.length < @min_word_length

            candidates[gram.join(" ")] = true
          end
        end

        candidates.keys
      end

      # Product of the member-word scores; interior stop words and short
      # words contribute a neutral 1.0.
      def score_candidate(phrase, word_stats)
        words = phrase.split(/\s+/)
        scores = words.map do |w|
          if stop_word?(w) || w.length < @min_word_length
            1.0
          else
            word_stats.dig(w, :score) || 1.0
          end
        end

        scores.length == 1 ? scores.first : scores.reduce(:*)
      end
    end
  end
end
@@ -0,0 +1,25 @@
# frozen_string_literal: true

module KeywordRuby
  # Value object pairing an extracted phrase with its score and,
  # optionally, its position in the source text.
  class Keyword
    attr_reader :phrase, :score, :position

    def initialize(phrase:, score:, position: nil)
      @phrase = phrase
      @score = score
      @position = position
    end

    # Hash form; the +position+ key is omitted when it was never set.
    def to_h
      attrs = { phrase: @phrase, score: @score, position: @position }
      attrs.compact
    end

    # Human-readable "phrase (score)" with the score rounded to 2 places.
    def to_s
      format("%s (%s)", @phrase, @score.round(2))
    end

    # Reversed comparison so a plain Array#sort yields highest score first.
    def <=>(other)
      other.score <=> @score
    end
  end
end
@@ -0,0 +1,132 @@
1
+ a
2
+ about
3
+ above
4
+ after
5
+ again
6
+ against
7
+ all
8
+ am
9
+ an
10
+ and
11
+ any
12
+ are
13
+ as
14
+ at
15
+ be
16
+ because
17
+ been
18
+ before
19
+ being
20
+ below
21
+ between
22
+ both
23
+ but
24
+ by
25
+ can
26
+ could
27
+ did
28
+ do
29
+ does
30
+ doing
31
+ down
32
+ during
33
+ each
34
+ few
35
+ for
36
+ from
37
+ further
38
+ get
39
+ got
40
+ had
41
+ has
42
+ have
43
+ having
44
+ he
45
+ her
46
+ here
47
+ hers
48
+ herself
49
+ him
50
+ himself
51
+ his
52
+ how
53
+ i
54
+ if
55
+ in
56
+ into
57
+ is
58
+ it
59
+ its
60
+ itself
61
+ just
62
+ me
63
+ might
64
+ more
65
+ most
66
+ my
67
+ myself
68
+ no
69
+ nor
70
+ not
71
+ of
72
+ off
73
+ on
74
+ once
75
+ only
76
+ or
77
+ other
78
+ ought
79
+ our
80
+ ours
81
+ ourselves
82
+ out
83
+ over
84
+ own
85
+ same
86
+ shall
87
+ she
88
+ should
89
+ so
90
+ some
91
+ such
92
+ than
93
+ that
94
+ the
95
+ their
96
+ theirs
97
+ them
98
+ themselves
99
+ then
100
+ there
101
+ these
102
+ they
103
+ this
104
+ those
105
+ through
106
+ to
107
+ too
108
+ under
109
+ until
110
+ up
111
+ us
112
+ very
113
+ was
114
+ we
115
+ were
116
+ what
117
+ when
118
+ where
119
+ which
120
+ while
121
+ who
122
+ whom
123
+ why
124
+ will
125
+ with
126
+ won
127
+ would
128
+ you
129
+ your
130
+ yours
131
+ yourself
132
+ yourselves
@@ -0,0 +1,242 @@
1
+ ada
2
+ adalah
3
+ adanya
4
+ agar
5
+ akan
6
+ akhirnya
7
+ aku
8
+ amat
9
+ anda
10
+ antara
11
+ apa
12
+ apabila
13
+ apakah
14
+ apalagi
15
+ atas
16
+ atau
17
+ awal
18
+ bahkan
19
+ bahwa
20
+ baik
21
+ banyak
22
+ baru
23
+ bawah
24
+ beberapa
25
+ begini
26
+ begitu
27
+ belum
28
+ benar
29
+ berapa
30
+ beri
31
+ berikan
32
+ berikut
33
+ bersama
34
+ besar
35
+ betul
36
+ biasa
37
+ biasanya
38
+ bila
39
+ bisa
40
+ boleh
41
+ buat
42
+ bukan
43
+ bulan
44
+ cukup
45
+ cuma
46
+ dalam
47
+ dan
48
+ dapat
49
+ dari
50
+ datang
51
+ dekat
52
+ demi
53
+ demikian
54
+ dengan
55
+ depan
56
+ dia
57
+ diantara
58
+ diri
59
+ dong
60
+ dua
61
+ dulu
62
+ empat
63
+ guna
64
+ hal
65
+ hampir
66
+ hanya
67
+ hari
68
+ harus
69
+ hendak
70
+ hingga
71
+ ia
72
+ ini
73
+ itu
74
+ jadi
75
+ jangan
76
+ jauh
77
+ jelas
78
+ jika
79
+ juga
80
+ kalau
81
+ kami
82
+ kamu
83
+ kan
84
+ kapan
85
+ karena
86
+ kata
87
+ ke
88
+ kebetulan
89
+ kecil
90
+ keluar
91
+ kembali
92
+ kemudian
93
+ kenapa
94
+ kepada
95
+ kira
96
+ kita
97
+ kok
98
+ kurang
99
+ lagi
100
+ lah
101
+ lain
102
+ lalu
103
+ lama
104
+ lanjut
105
+ lebih
106
+ lewat
107
+ lima
108
+ luar
109
+ macam
110
+ maka
111
+ makin
112
+ malah
113
+ mampu
114
+ mana
115
+ masa
116
+ masih
117
+ masing
118
+ mau
119
+ maupun
120
+ memang
121
+ memberi
122
+ membuat
123
+ meminta
124
+ mempergunakan
125
+ mempunyai
126
+ memulai
127
+ menambahkan
128
+ menanti
129
+ mengatakan
130
+ mengenai
131
+ menggunakan
132
+ menginginkan
133
+ menjadi
134
+ menjawab
135
+ menuju
136
+ menurut
137
+ merasa
138
+ mereka
139
+ merupakan
140
+ meski
141
+ minta
142
+ mungkin
143
+ nah
144
+ namun
145
+ nanti
146
+ nyaris
147
+ oleh
148
+ pada
149
+ padahal
150
+ paling
151
+ panjang
152
+ para
153
+ pasti
154
+ penting
155
+ per
156
+ pernah
157
+ pertama
158
+ pihak
159
+ pula
160
+ pun
161
+ punya
162
+ rasa
163
+ rata
164
+ rupanya
165
+ saat
166
+ saja
167
+ salah
168
+ sama
169
+ sambil
170
+ sampai
171
+ sana
172
+ sangat
173
+ satu
174
+ saya
175
+ se
176
+ sebab
177
+ sebagai
178
+ sebagian
179
+ sebelum
180
+ sebenarnya
181
+ seberapa
182
+ sebesar
183
+ sebuah
184
+ sedang
185
+ sedikit
186
+ segala
187
+ segera
188
+ sejak
189
+ selain
190
+ selalu
191
+ selama
192
+ seluruh
193
+ semakin
194
+ sementara
195
+ sempat
196
+ semua
197
+ sendiri
198
+ seorang
199
+ seperti
200
+ sering
201
+ serta
202
+ sesuatu
203
+ setelah
204
+ siapa
205
+ siapapun
206
+ sini
207
+ soal
208
+ sudah
209
+ supaya
210
+ tadi
211
+ tak
212
+ tambah
213
+ tanya
214
+ tapi
215
+ tenang
216
+ tengah
217
+ tentang
218
+ tentu
219
+ tepat
220
+ terakhir
221
+ terdapat
222
+ terhadap
223
+ terlalu
224
+ termasuk
225
+ ternyata
226
+ tertentu
227
+ terus
228
+ tetap
229
+ tetapi
230
+ tiap
231
+ tiba
232
+ tidak
233
+ tiga
234
+ toh
235
+ turut
236
+ untuk
237
+ usai
238
+ waduh
239
+ wah
240
+ waktu
241
+ walaupun
242
+ yang
@@ -0,0 +1,12 @@
# frozen_string_literal: true

module KeywordRuby
  module TextProcessing
    # Splits text into sentences on whitespace that follows ., ! or ?,
    # keeping the terminal punctuation attached to each sentence.
    class SentenceSplitter
      def self.split(text)
        # Lookbehind keeps the delimiter with the preceding sentence.
        text
          .split(/(?<=[.!?])\s+/)
          .filter_map do |chunk|
            trimmed = chunk.strip
            trimmed unless trimmed.empty?
          end
      end
    end
  end
end
@@ -0,0 +1,31 @@
# frozen_string_literal: true

# Set is only autoloaded from Ruby 3.2; this gem supports Ruby >= 3.0,
# so it must be required explicitly before Set.new below.
require "set"

module KeywordRuby
  module TextProcessing
    # Loads a newline-delimited stop-word list for a language and answers
    # case-insensitive membership queries.
    class StopWords
      # Word lists live in lib/keyword_ruby/stop_words/<language>.txt.
      STOP_WORDS_DIR = File.join(__dir__, "..", "stop_words")

      def initialize(language: :en)
        @language = language
        @words = load_stop_words
      end

      # True when the (downcased) word appears in the loaded list.
      def stop_word?(word)
        @words.include?(word.downcase)
      end

      # Returns +words+ with all stop words removed.
      def filter(words)
        words.reject { |w| stop_word?(w) }
      end

      private

      # Reads the list for @language; an unknown language yields an empty
      # set (so every word passes through) rather than raising.
      def load_stop_words
        path = File.join(STOP_WORDS_DIR, "#{@language}.txt")
        return Set.new unless File.exist?(path)

        Set.new(File.readlines(path, chomp: true).map(&:downcase).reject(&:empty?))
      end
    end
  end
end
@@ -0,0 +1,38 @@
# frozen_string_literal: true

module KeywordRuby
  module TextProcessing
    # Lowercasing, contraction-expanding word tokenizer plus a naive
    # sentence splitter.
    class Tokenizer
      # Irregular contractions that the plain suffix swap below would
      # mangle ("can't" -> "ca not", "won't" -> "wo not"); expanded first,
      # anchored on word boundaries.
      IRREGULAR_CONTRACTIONS = {
        /\bcan't\b/ => "cannot",
        /\bwon't\b/ => "will not"
      }.freeze

      # Common English contractions mapped to their expanded forms
      CONTRACTIONS = {
        "n't" => " not",
        "'re" => " are",
        "'ve" => " have",
        "'ll" => " will",
        "'d" => " would",
        "'m" => " am",
        "'s" => "", # possessive or "is" -- just remove
      }.freeze

      # Splits +text+ into lowercase word tokens. Letters, digits, and
      # hyphens are kept; every other character becomes a separator.
      def self.tokenize(text)
        normalized = text.downcase
        # Normalize curly/smart apostrophes to straight
        normalized = normalized.gsub(/[\u2018\u2019\u2032]/, "'")
        # Expand irregular forms first, then the generic suffixes,
        # before stripping punctuation
        IRREGULAR_CONTRACTIONS.each do |pattern, expansion|
          normalized = normalized.gsub(pattern, expansion)
        end
        CONTRACTIONS.each do |suffix, expansion|
          normalized = normalized.gsub(suffix, expansion)
        end
        normalized
          .gsub(/[^\p{L}\p{N}\s-]/, " ")
          .split(/\s+/)
          .reject(&:empty?)
      end

      # Naive sentence split on ., !, ? and newlines (punctuation is
      # discarded; see SentenceSplitter for a punctuation-preserving split).
      def self.sentences(text)
        text.split(/[.!?\n]+/)
            .map(&:strip)
            .reject(&:empty?)
      end
    end
  end
end
@@ -0,0 +1,5 @@
# frozen_string_literal: true

module KeywordRuby
  # Gem version, kept in its own file so the gemspec can require it
  # without loading the whole library.
  VERSION = "0.1.1"
end
@@ -0,0 +1,47 @@
# frozen_string_literal: true

# Set is used by TextProcessing::StopWords and Extractors::Yake. It is
# only autoloaded from Ruby 3.2, and this gem supports Ruby >= 3.0, so it
# must be required explicitly.
require "set"

require_relative "keyword_ruby/version"
require_relative "keyword_ruby/configuration"
require_relative "keyword_ruby/keyword"
require_relative "keyword_ruby/text_processing/stop_words"
require_relative "keyword_ruby/text_processing/tokenizer"
require_relative "keyword_ruby/text_processing/sentence_splitter"
require_relative "keyword_ruby/extractors/base"
require_relative "keyword_ruby/extractors/rake"
require_relative "keyword_ruby/extractors/yake"
require_relative "keyword_ruby/extractors/tfidf"

# Top-level namespace and convenience API for keyword extraction.
module KeywordRuby
  # Base error class for the gem.
  class Error < StandardError; end

  class << self
    # Lazily-created global configuration (see Configuration).
    def configuration
      @configuration ||= Configuration.new
    end

    # Yields the global configuration for block-style setup:
    #   KeywordRuby.configure { |c| c.default_top_n = 5 }
    def configure
      yield(configuration)
    end

    # Restores all settings to their defaults (useful in tests).
    def reset_configuration!
      @configuration = Configuration.new
    end

    # Extracts keywords from +text+ using :rake, :yake, or :tfidf
    # (defaulting to the configured algorithm). Returns an array of
    # Keyword objects, best first. Raises ArgumentError for an unknown
    # algorithm.
    def extract(text, algorithm: nil, language: nil, top_n: nil, normalize: true)
      algo = algorithm || configuration.default_algorithm

      extractor = case algo
                  when :rake then Extractors::Rake.new(language: language, top_n: top_n, normalize: normalize)
                  when :yake then Extractors::Yake.new(language: language, top_n: top_n, normalize: normalize)
                  when :tfidf then Extractors::Tfidf.new(language: language, top_n: top_n, normalize: normalize)
                  else raise ArgumentError, "Unknown algorithm: #{algo}. Supported: :rake, :yake, :tfidf"
                  end

      extractor.extract(text)
    end

    # Maps #extract over +documents+, forwarding all keyword options.
    def extract_batch(documents, **opts)
      documents.map { |doc| extract(doc, **opts) }
    end
  end
end
metadata ADDED
@@ -0,0 +1,90 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: keyword-ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Johannes Dwi Cahyo
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: minitest
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '5.0'
19
+ type: :development
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '5.0'
26
+ - !ruby/object:Gem::Dependency
27
+ name: rake
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '13.0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: '13.0'
40
+ description: Pure Ruby keyword and keyphrase extraction library. Implements RAKE,
41
+ YAKE, and TF-IDF algorithms for extracting keywords from text.
42
+ email:
43
+ - johannes@example.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - CHANGELOG.md
49
+ - LICENSE
50
+ - README.md
51
+ - Rakefile
52
+ - keyword-ruby.gemspec
53
+ - lib/keyword_ruby.rb
54
+ - lib/keyword_ruby/configuration.rb
55
+ - lib/keyword_ruby/extractors/base.rb
56
+ - lib/keyword_ruby/extractors/rake.rb
57
+ - lib/keyword_ruby/extractors/tfidf.rb
58
+ - lib/keyword_ruby/extractors/yake.rb
59
+ - lib/keyword_ruby/keyword.rb
60
+ - lib/keyword_ruby/stop_words/en.txt
61
+ - lib/keyword_ruby/stop_words/id.txt
62
+ - lib/keyword_ruby/text_processing/sentence_splitter.rb
63
+ - lib/keyword_ruby/text_processing/stop_words.rb
64
+ - lib/keyword_ruby/text_processing/tokenizer.rb
65
+ - lib/keyword_ruby/version.rb
66
+ homepage: https://github.com/johannesdwicahyo/keyword-ruby
67
+ licenses:
68
+ - MIT
69
+ metadata:
70
+ homepage_uri: https://github.com/johannesdwicahyo/keyword-ruby
71
+ source_code_uri: https://github.com/johannesdwicahyo/keyword-ruby
72
+ changelog_uri: https://github.com/johannesdwicahyo/keyword-ruby/blob/main/CHANGELOG.md
73
+ rdoc_options: []
74
+ require_paths:
75
+ - lib
76
+ required_ruby_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: 3.0.0
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ requirements: []
87
+ rubygems_version: 3.6.9
88
+ specification_version: 4
89
+ summary: Keyword extraction for Ruby using RAKE, YAKE, and TF-IDF
90
+ test_files: []