keyword-ruby 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 433764acb546b2b35c3e05f56095ec7ba4642a8c67591d9466fb1a6a8ae47bdf
4
- data.tar.gz: e83e1a3e1b454b584efccf44a592c7868e788ff756e946e4c7631e696982cc09
3
+ metadata.gz: ae61b0d85e3b754d30502e323aac159abbe570b4b30341b8b8c1d86f0e3f65c2
4
+ data.tar.gz: 729dc165b59cb53568032f884ccc9eae34c80958cb5d87698bcceff0eb0f6061
5
5
  SHA512:
6
- metadata.gz: b5dbf9ec6be9a3b3b657c03b6397df41de8152a02ab8539f5b62c435de8dd51ba80ca6d16645f50971d5bb48e9c2e1351a9e63081d61f89251c2d257f23cef54
7
- data.tar.gz: 0d51a0fe597f68dff3231881ac131d69b85ee210bf638cdb95d8e8e8d1b2cdfab4f4e50c8f15d4c6271e394d647e5f4778e8a104270ea17a851c70d88cafc942
6
+ metadata.gz: a8bf751b18fff5031917192ebc67cd30da67705d72ebc30d2b31c4e03b488223f4572f99c36261e8dd81c66491f212e9db2ce3e456e8df89efb3bb642d3e7790
7
+ data.tar.gz: 1a1fa33626b2946b913c93eb984dc9854cb65c65b0b208d5959e4aac9dcb255f80dfbf668b1219cafa803acb227bed075b26031af305fcf15fb263bc5637a748
data/README.md CHANGED
@@ -1,11 +1,11 @@
1
1
  # keyword-ruby
2
2
 
3
- Pure Ruby keyword and keyphrase extraction using RAKE, YAKE, and TF-IDF algorithms.
3
+ Keyword extraction for Ruby using RAKE, YAKE, and TF-IDF algorithms. Extract the most relevant terms from any text.
4
4
 
5
5
  ## Installation
6
6
 
7
7
  ```ruby
8
- gem "keyword-ruby", "~> 0.1"
8
+ gem "keyword-ruby"
9
9
  ```
10
10
 
11
11
  ## Usage
@@ -13,22 +13,31 @@ gem "keyword-ruby", "~> 0.1"
13
13
  ```ruby
14
14
  require "keyword_ruby"
15
15
 
16
- # RAKE (default)
17
- keywords = KeywordRuby.extract("Your text here...")
18
- keywords.each { |kw| puts "#{kw.phrase}: #{kw.score}" }
16
+ text = "Ruby is a dynamic programming language focused on simplicity and productivity."
19
17
 
20
- # YAKE
21
- keywords = KeywordRuby.extract(text, algorithm: :yake)
18
+ # RAKE (Rapid Automatic Keyword Extraction)
19
+ keywords = KeywordRuby.extract(text, algorithm: :rake, top_n: 5)
22
20
 
23
- # TF-IDF (with corpus)
21
+ # YAKE (Yet Another Keyword Extractor)
22
+ keywords = KeywordRuby.extract(text, algorithm: :yake, top_n: 5)
23
+
24
+ # TF-IDF
24
25
  extractor = KeywordRuby::Extractors::Tfidf.new
25
- extractor.fit(corpus_documents)
26
- keywords = extractor.extract(text)
26
+ extractor.fit(corpus) # optional: fit on a corpus
27
+ keywords = extractor.extract(text, top_n: 5)
27
28
 
28
- # Batch extraction
29
- results = KeywordRuby.extract_batch(documents, algorithm: :rake, top_n: 5)
29
+ keywords.each { |kw| puts "#{kw.phrase}: #{kw.score}" }
30
30
  ```
31
31
 
32
+ ## Features
33
+
34
+ - RAKE with proper co-occurrence degree calculation
35
+ - YAKE with stop word handling in multi-word phrases
36
+ - TF-IDF with optional corpus fitting (falls back to TF-only)
37
+ - Score normalization to 0.0-1.0 range
38
+ - English contraction expansion (don't → do not)
39
+ - Input validation and language support
40
+
32
41
  ## License
33
42
 
34
43
  MIT
@@ -3,7 +3,7 @@
3
3
  module KeywordRuby
4
4
  class Configuration
5
5
  attr_accessor :default_algorithm, :default_language, :default_top_n,
6
- :max_phrase_length, :min_word_length
6
+ :max_phrase_length, :min_word_length, :custom_stop_words
7
7
 
8
8
  def initialize
9
9
  @default_algorithm = :rake
@@ -11,6 +11,7 @@ module KeywordRuby
11
11
  @default_top_n = 10
12
12
  @max_phrase_length = 4
13
13
  @min_word_length = 2
14
+ @custom_stop_words = []
14
15
  end
15
16
  end
16
17
  end
@@ -3,7 +3,7 @@
3
3
  module KeywordRuby
4
4
  module Extractors
5
5
  class Base
6
- SUPPORTED_LANGUAGES = %i[en id].freeze
6
+ SUPPORTED_LANGUAGES = %i[en id ms nl fr de es pt ar ja].freeze
7
7
 
8
8
  def initialize(language: nil, top_n: nil, max_length: nil, min_word_length: nil, normalize: true)
9
9
  config = KeywordRuby.configuration
@@ -0,0 +1,107 @@
# frozen_string_literal: true

module KeywordRuby
  module Extractors
    # TextRank keyword extractor.
    #
    # Builds an undirected word co-occurrence graph over the content words of
    # the input text, ranks the words with PageRank, then assembles single- and
    # multi-word candidate phrases scored by the sum of their word ranks.
    class TextRank < Base
      DEFAULT_DAMPING = 0.85
      DEFAULT_ITERATIONS = 30
      DEFAULT_CONVERGENCE = 0.0001
      # Co-occurrence window size used when linking words in the graph.
      DEFAULT_WINDOW = 4

      # @param damping [Float] PageRank damping factor (typically 0.85)
      # @param iterations [Integer] maximum number of PageRank iterations
      # @param opts [Hash] options forwarded to Base (language:, top_n:, ...)
      def initialize(damping: DEFAULT_DAMPING, iterations: DEFAULT_ITERATIONS, **opts)
        super(**opts)
        @damping = damping
        @iterations = iterations
      end

      # Extract up to @top_n keywords from +text+, best first.
      #
      # @param text [String, nil] input document
      # @return [Array<Keyword>] normalized, highest-scoring candidates
      def extract(text)
        validate_text!(text)
        return [] if text.nil? || text.strip.empty?

        words = TextProcessing::Tokenizer.tokenize(text)
                                         .reject { |w| stop_word?(w) || w.length < @min_word_length }
        return [] if words.empty?

        graph = build_graph(words, window: DEFAULT_WINDOW)
        return [] if graph.empty?

        scores = pagerank(graph)

        # Phrase generation scans the full token stream (stop words included)
        # so that interior stop words do not split otherwise-adjacent grams.
        all_words = TextProcessing::Tokenizer.tokenize(text)
        candidates = generate_phrases(all_words, scores)

        # BUG FIX: the previous `candidates.sort.first(@top_n)` relied on
        # Keyword#<=> and, being an ascending sort, selected the LOWEST-ranked
        # candidates. Top-N must be the highest-scoring candidates.
        results = candidates.sort_by { |kw| -kw.score }.first(@top_n)
        normalize_scores(results)
      end

      private

      # Build a symmetric weighted co-occurrence graph: every pair of distinct
      # words appearing within the same sliding window gets a mutual edge.
      def build_graph(words, window: DEFAULT_WINDOW)
        graph = Hash.new { |h, k| h[k] = Hash.new(0.0) }

        # EDGE-CASE FIX: each_cons(n) yields nothing when the collection has
        # fewer than n elements, so texts with fewer than `window` content
        # words produced an empty graph and no keywords. Clamp the window so
        # short-but-valid inputs (>= 2 content words) still form edges.
        span = [window, words.size].min
        words.each_cons(span) do |group|
          group.uniq.combination(2) do |a, b|
            graph[a][b] += 1.0
            graph[b][a] += 1.0
          end
        end

        graph
      end

      # Weighted PageRank over the co-occurrence graph. Stops early once the
      # largest per-node score change drops below DEFAULT_CONVERGENCE.
      #
      # @param graph [Hash{String=>Hash{String=>Float}}]
      # @return [Hash{String=>Float}] word => rank
      def pagerank(graph)
        nodes = graph.keys
        node_count = nodes.size.to_f
        scores = nodes.map { |node| [node, 1.0 / node_count] }.to_h

        @iterations.times do
          new_scores = {}
          max_diff = 0.0

          nodes.each do |node|
            rank = (1.0 - @damping) / node_count

            graph[node].each do |neighbor, weight|
              out_weight = graph[neighbor].values.sum
              rank += @damping * (scores[neighbor] || 0.0) * weight / out_weight if out_weight > 0
            end

            new_scores[node] = rank
            max_diff = [max_diff, (rank - (scores[node] || 0.0)).abs].max
          end

          scores = new_scores
          break if max_diff < DEFAULT_CONVERGENCE
        end

        scores
      end

      # Combine ranked single words and 2..@max_length n-grams into Keyword
      # candidates. A phrase's score is the sum of its words' ranks; duplicate
      # phrases keep their maximum score.
      def generate_phrases(words, word_scores)
        phrases = {}

        # Every ranked single word is itself a candidate.
        word_scores.each { |word, score| phrases[word] = score }

        # Multi-word phrases may contain interior stop words, but must not
        # start or end with one (nor with a too-short word).
        (2..@max_length).each do |n|
          words.each_cons(n) do |gram|
            next if stop_word?(gram.first) || stop_word?(gram.last)
            next if gram.first.length < @min_word_length || gram.last.length < @min_word_length

            phrase = gram.join(" ")
            score = gram.sum { |w| word_scores[w] || 0.0 }
            phrases[phrase] = [phrases[phrase] || 0.0, score].max
          end
        end

        phrases.map { |phrase, score| Keyword.new(phrase: phrase, score: score) }
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ في
2
+ من
3
+ على
4
+ إلى
5
+ أن
6
+ هذا
7
+ هذه
8
+ التي
9
+ الذي
10
+ عن
11
+ مع
12
+ كان
13
+ لا
14
+ ما
15
+ هو
16
+ هي
17
+ قد
18
+ بين
19
+ أو
20
+ عند
21
+ بعد
22
+ كل
23
+ ذلك
24
+ تلك
25
+ حتى
26
+ إذا
27
+ ثم
28
+ لم
29
+ لن
@@ -0,0 +1,39 @@
1
+ der
2
+ die
3
+ das
4
+ und
5
+ ist
6
+ ein
7
+ eine
8
+ in
9
+ den
10
+ von
11
+ zu
12
+ mit
13
+ auf
14
+ für
15
+ nicht
16
+ sich
17
+ des
18
+ dem
19
+ als
20
+ auch
21
+ es
22
+ an
23
+ er
24
+ so
25
+ dass
26
+ aus
27
+ bei
28
+ nach
29
+ wie
30
+ über
31
+ hat
32
+ oder
33
+ noch
34
+ aber
35
+ um
36
+ wenn
37
+ kann
38
+ nur
39
+ werden
@@ -0,0 +1,37 @@
1
+ el
2
+ la
3
+ los
4
+ las
5
+ de
6
+ del
7
+ en
8
+ un
9
+ una
10
+ que
11
+ es
12
+ por
13
+ con
14
+ no
15
+ para
16
+ se
17
+ al
18
+ lo
19
+ su
20
+ como
21
+ más
22
+ pero
23
+ sus
24
+ le
25
+ ya
26
+ fue
27
+ este
28
+ ha
29
+ sí
30
+ porque
31
+ esta
32
+ entre
33
+ cuando
34
+ muy
35
+ sin
36
+ sobre
37
+ también
@@ -0,0 +1,44 @@
1
+ le
2
+ la
3
+ les
4
+ de
5
+ des
6
+ du
7
+ un
8
+ une
9
+ et
10
+ est
11
+ en
12
+ que
13
+ qui
14
+ dans
15
+ ce
16
+ il
17
+ ne
18
+ sur
19
+ se
20
+ pas
21
+ plus
22
+ par
23
+ je
24
+ avec
25
+ tout
26
+ faire
27
+ son
28
+ au
29
+ mais
30
+ nous
31
+ ont
32
+ cette
33
+ ou
34
+ été
35
+ aussi
36
+ leur
37
+ bien
38
+ peut
39
+ même
40
+ ces
41
+ quand
42
+ entre
43
+ notre
44
+ après
@@ -0,0 +1,40 @@
1
+ の
2
+ に
3
+ は
4
+ を
5
+ た
6
+ が
7
+ で
8
+ て
9
+ と
10
+ し
11
+ れ
12
+ さ
13
+ ある
14
+ いる
15
+ も
16
+ する
17
+ から
18
+ な
19
+ こと
20
+ として
21
+ い
22
+ や
23
+ れる
24
+ など
25
+ なっ
26
+ ない
27
+ この
28
+ ため
29
+ その
30
+ あっ
31
+ よう
32
+ また
33
+ もの
34
+ という
35
+ あり
36
+ まで
37
+ られ
38
+ なる
39
+ へ
40
+ か
@@ -0,0 +1,29 @@
1
+ yang
2
+ dan
3
+ di
4
+ ini
5
+ itu
6
+ dengan
7
+ untuk
8
+ dari
9
+ adalah
10
+ pada
11
+ tidak
12
+ dalam
13
+ akan
14
+ telah
15
+ ke
16
+ oleh
17
+ ada
18
+ juga
19
+ saya
20
+ mereka
21
+ sudah
22
+ boleh
23
+ kami
24
+ kita
25
+ semua
26
+ antara
27
+ lebih
28
+ atas
29
+ bagi
@@ -0,0 +1,29 @@
1
+ de
2
+ het
3
+ een
4
+ van
5
+ in
6
+ en
7
+ is
8
+ dat
9
+ op
10
+ te
11
+ voor
12
+ met
13
+ zijn
14
+ er
15
+ niet
16
+ ook
17
+ maar
18
+ door
19
+ als
20
+ aan
21
+ bij
22
+ of
23
+ om
24
+ nog
25
+ uit
26
+ dan
27
+ naar
28
+ over
29
+ tot
@@ -0,0 +1,36 @@
1
+ de
2
+ que
3
+ não
4
+ do
5
+ da
6
+ em
7
+ um
8
+ para
9
+ com
10
+ uma
11
+ os
12
+ no
13
+ se
14
+ na
15
+ por
16
+ mais
17
+ as
18
+ dos
19
+ como
20
+ mas
21
+ ao
22
+ ele
23
+ das
24
+ tem
25
+ seu
26
+ sua
27
+ ou
28
+ quando
29
+ muito
30
+ nos
31
+ já
32
+ também
33
+ só
34
+ pelo
35
+ pela
36
+ até
@@ -5,9 +5,11 @@ module KeywordRuby
5
5
  class StopWords
6
6
  STOP_WORDS_DIR = File.join(__dir__, "..", "stop_words")
7
7
 
8
- def initialize(language: :en)
8
+ def initialize(language: :en, strict: false)
9
9
  @language = language
10
+ @strict = strict
10
11
  @words = load_stop_words
12
+ add_custom_stop_words
11
13
  end
12
14
 
13
15
  def stop_word?(word)
@@ -18,14 +20,26 @@ module KeywordRuby
18
20
  words.reject { |w| stop_word?(w) }
19
21
  end
20
22
 
23
+ def add(words)
24
+ words.each { |w| @words.add(w.downcase) }
25
+ end
26
+
21
27
  private
22
28
 
23
29
  def load_stop_words
24
30
  path = File.join(STOP_WORDS_DIR, "#{@language}.txt")
31
+ if !File.exist?(path) && @strict
32
+ raise KeywordRuby::Error, "Stop word file not found for language: #{@language}"
33
+ end
25
34
  return Set.new unless File.exist?(path)
26
35
 
27
36
  Set.new(File.readlines(path, chomp: true).map(&:downcase).reject(&:empty?))
28
37
  end
38
+
39
+ def add_custom_stop_words
40
+ custom = KeywordRuby.configuration.custom_stop_words
41
+ custom.each { |w| @words.add(w.downcase) } if custom
42
+ end
29
43
  end
30
44
  end
31
45
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module KeywordRuby
4
- VERSION = "0.1.1"
4
+ VERSION = "0.2.0"
5
5
  end
data/lib/keyword_ruby.rb CHANGED
@@ -5,15 +5,17 @@ require_relative "keyword_ruby/configuration"
5
5
  require_relative "keyword_ruby/keyword"
6
6
  require_relative "keyword_ruby/text_processing/stop_words"
7
7
  require_relative "keyword_ruby/text_processing/tokenizer"
8
- require_relative "keyword_ruby/text_processing/sentence_splitter"
9
8
  require_relative "keyword_ruby/extractors/base"
10
9
  require_relative "keyword_ruby/extractors/rake"
11
10
  require_relative "keyword_ruby/extractors/yake"
12
11
  require_relative "keyword_ruby/extractors/tfidf"
12
+ require_relative "keyword_ruby/extractors/textrank"
13
13
 
14
14
  module KeywordRuby
15
15
  class Error < StandardError; end
16
16
 
17
+ ALGORITHMS = %i[rake yake tfidf textrank].freeze
18
+
17
19
  class << self
18
20
  def configuration
19
21
  @configuration ||= Configuration.new
@@ -29,19 +31,33 @@ module KeywordRuby
29
31
 
30
32
  def extract(text, algorithm: nil, language: nil, top_n: nil, normalize: true)
31
33
  algo = algorithm || configuration.default_algorithm
34
+ build_extractor(algo, language: language, top_n: top_n, normalize: normalize).extract(text)
35
+ end
32
36
 
33
- extractor = case algo
34
- when :rake then Extractors::Rake.new(language: language, top_n: top_n, normalize: normalize)
35
- when :yake then Extractors::Yake.new(language: language, top_n: top_n, normalize: normalize)
36
- when :tfidf then Extractors::Tfidf.new(language: language, top_n: top_n, normalize: normalize)
37
- else raise ArgumentError, "Unknown algorithm: #{algo}. Supported: :rake, :yake, :tfidf"
38
- end
37
+ def extract_batch(documents, algorithm: nil, language: nil, top_n: nil, normalize: true)
38
+ algo = algorithm || configuration.default_algorithm
39
39
 
40
- extractor.extract(text)
40
+ if algo == :tfidf
41
+ # TF-IDF benefits from shared corpus state
42
+ extractor = build_extractor(algo, language: language, top_n: top_n, normalize: normalize)
43
+ extractor.fit(documents)
44
+ documents.map { |doc| extractor.extract(doc) }
45
+ else
46
+ extractor = build_extractor(algo, language: language, top_n: top_n, normalize: normalize)
47
+ documents.map { |doc| extractor.extract(doc) }
48
+ end
41
49
  end
42
50
 
43
- def extract_batch(documents, **opts)
44
- documents.map { |doc| extract(doc, **opts) }
51
+ private
52
+
53
+ def build_extractor(algo, **opts)
54
+ case algo
55
+ when :rake then Extractors::Rake.new(**opts)
56
+ when :yake then Extractors::Yake.new(**opts)
57
+ when :tfidf then Extractors::Tfidf.new(**opts)
58
+ when :textrank then Extractors::TextRank.new(**opts)
59
+ else raise ArgumentError, "Unknown algorithm: #{algo}. Supported: #{ALGORITHMS.join(', ')}"
60
+ end
45
61
  end
46
62
  end
47
63
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: keyword-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Johannes Dwi Cahyo
@@ -54,11 +54,20 @@ files:
54
54
  - lib/keyword_ruby/configuration.rb
55
55
  - lib/keyword_ruby/extractors/base.rb
56
56
  - lib/keyword_ruby/extractors/rake.rb
57
+ - lib/keyword_ruby/extractors/textrank.rb
57
58
  - lib/keyword_ruby/extractors/tfidf.rb
58
59
  - lib/keyword_ruby/extractors/yake.rb
59
60
  - lib/keyword_ruby/keyword.rb
61
+ - lib/keyword_ruby/stop_words/ar.txt
62
+ - lib/keyword_ruby/stop_words/de.txt
60
63
  - lib/keyword_ruby/stop_words/en.txt
64
+ - lib/keyword_ruby/stop_words/es.txt
65
+ - lib/keyword_ruby/stop_words/fr.txt
61
66
  - lib/keyword_ruby/stop_words/id.txt
67
+ - lib/keyword_ruby/stop_words/ja.txt
68
+ - lib/keyword_ruby/stop_words/ms.txt
69
+ - lib/keyword_ruby/stop_words/nl.txt
70
+ - lib/keyword_ruby/stop_words/pt.txt
62
71
  - lib/keyword_ruby/text_processing/sentence_splitter.rb
63
72
  - lib/keyword_ruby/text_processing/stop_words.rb
64
73
  - lib/keyword_ruby/text_processing/tokenizer.rb