keyphrase 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1d9478f7e5c47a826e90eae00884034ff4078262ce6b46a8f6c0a2070824b3c7
4
- data.tar.gz: f14706384868b463ce8b968dbb4b6dd7e37bd6214d1d02b68ceb5d629930dec9
3
+ metadata.gz: be1cbe539ab9b72b5eeaa0f8f648853b3953f06e0cbe96e9e1870b01695bde10
4
+ data.tar.gz: 8ba868ba0e54ffca3d02ec004b8bb6ed92a2c894a583e1a50205da0f43b39cc3
5
5
  SHA512:
6
- metadata.gz: 776507b887f30458a163f15999fcd0c57543b32c7acb8d604372180cc87f740a7c48d240311a9d5dbb9b0bdb0f46c5d02985332d622e10c85b2250bfd8f3af00
7
- data.tar.gz: d408f218b406c45fe47c9315633aeb0fbe364cd5a3d597f2b77433bb3dc74289bfe8b90e5c5a187117d677b236f47763478fea4857c5619ce99cfb7f03946f65
6
+ metadata.gz: 19cf0a7f7752f6cf1bcba3f65687d52897dd6dc4d8fe068701cc4b847d2480dd00d851bee0cb313b9c9be736806098655a6f65ff2de341e797cf2cf45f15f685
7
+ data.tar.gz: cb5ad4e66d3413facc873691215f767cb3c9dff1af5dc301fd460dd26d64ce9b5f47c53c08520cf418eb5bc4973a2dee7e0fdd6590c9767205ee106b91a938b4
data/lib/keyphrase/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Keyphrase
4
- VERSION = "0.1.1"
4
+ VERSION = "0.1.2"
5
5
  end
data/lib/keyphrase.rb CHANGED
@@ -66,15 +66,23 @@ class Keyphrase
66
66
  def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
67
67
  phrases = Array.new
68
68
 
69
- filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|") }
69
+ # first clean by removing unwanted special chars
70
+ # second remove all stop words
71
+ # third, remove uncaught stopwords in second pass
72
+ # using a | as an easy way to divide the text by stopwords
73
+ filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, " | ").gsub(stopwords_regex, "|") }
70
74
 
71
75
  filtered_sentences.each do |parts|
72
76
  parts.split("|").each do |part|
77
+ next if part.empty?
78
+
79
+ # remove blacklisted things, like 1234.45.34
80
+ # clean up spacing between words
73
81
  part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip
74
82
 
75
- if !part.empty?
76
- phrases.push part
77
- end
83
+ next if part.empty?
84
+
85
+ phrases.push part
78
86
  end
79
87
  end
80
88
 
@@ -118,7 +126,7 @@ class Keyphrase
118
126
  # 4
119
127
  def generateCandidateKeywordScores phrases, scores, position_bonus
120
128
  candidates = Hash.new 0
121
- word_index = 0
129
+ phrase_index = 0
122
130
 
123
131
  phrases.each do |phrase|
124
132
  words = seperateWords(phrase)
@@ -126,13 +134,15 @@ class Keyphrase
126
134
  words.each do |word|
127
135
  score += scores[word]
128
136
 
129
- # Normalize the score based on the position
130
- if position_bonus
131
- normalized_score = 1.0 / (word_index + 1)
132
- score += normalized_score
133
- word_index += 1
134
- end
135
137
  end
138
+
139
+ # Boost score based on the phrase position in the text
140
+ if position_bonus
141
+ normalized_score = 1.0 / (phrase_index + 1)
142
+ score += normalized_score
143
+ phrase_index += 1
144
+ end
145
+
136
146
  candidates[phrase] = score
137
147
  end
138
148
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: keyphrase
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben D'Angelo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-06 00:00:00.000000000 Z
11
+ date: 2023-12-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec