keyphrase 0.1.1 → 0.1.3

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1d9478f7e5c47a826e90eae00884034ff4078262ce6b46a8f6c0a2070824b3c7
4
- data.tar.gz: f14706384868b463ce8b968dbb4b6dd7e37bd6214d1d02b68ceb5d629930dec9
3
+ metadata.gz: 56611558acf8336a81d11dc0b6cd1168ed0c008f48822efed61741b2448f3a65
4
+ data.tar.gz: 610725d8f12dbda7d041160bd98a976732f7a748644808e73afcc91f954c114d
5
5
  SHA512:
6
- metadata.gz: 776507b887f30458a163f15999fcd0c57543b32c7acb8d604372180cc87f740a7c48d240311a9d5dbb9b0bdb0f46c5d02985332d622e10c85b2250bfd8f3af00
7
- data.tar.gz: d408f218b406c45fe47c9315633aeb0fbe364cd5a3d597f2b77433bb3dc74289bfe8b90e5c5a187117d677b236f47763478fea4857c5619ce99cfb7f03946f65
6
+ metadata.gz: e996bfb9191c68a4df50b9ad52d4deaf553141c097d4e3172b9dae24246ef4098d47d8e67ef2beaa27acc6ef94e497ddb0a804307dd1d80b64cb740d9c8fb310
7
+ data.tar.gz: 826ae9a9d3d3f1c1ffbcb381c7be7b3f6ed8410b89846494cd490e5ecc0dfddccb74f59088ef70c3f4b4e07dd5e9ca2b4a497d3d1ec805eb11a0179753ecda54
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Keyphrase
4
- VERSION = "0.1.1"
4
+ VERSION = "0.1.3"
5
5
  end
data/lib/keyphrase.rb CHANGED
@@ -7,7 +7,7 @@ class Keyphrase
7
7
  autoload :Stoplist, "keyphrase/stoplist"
8
8
 
9
9
  CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
10
- BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-]+\b|\'/ # remove words with no letters, ie 123.23.12. And last chance to remove '
10
+ BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
11
11
  CLEAN_SPACES_REGEX = /\s+/
12
12
  SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
13
13
 
@@ -66,15 +66,23 @@ class Keyphrase
66
66
  def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
67
67
  phrases = Array.new
68
68
 
69
- filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|") }
69
+ # first clean by removing unwanted special chars
70
+ # second remove all stop words
71
+ # third, remove uncaught stopwords in second pass
72
+ # using a | as an easy way to divide the text by stopwords
73
+ filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, " | ").gsub(stopwords_regex, "|") }
70
74
 
71
75
  filtered_sentences.each do |parts|
72
76
  parts.split("|").each do |part|
77
+ next if part.empty?
78
+
79
+ # remove blacklisted things, like 1234.45.34
80
+ # clean up spacing between words
73
81
  part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip
74
82
 
75
- if !part.empty?
76
- phrases.push part
77
- end
83
+ next if part.empty?
84
+
85
+ phrases.push part
78
86
  end
79
87
  end
80
88
 
@@ -118,7 +126,7 @@ class Keyphrase
118
126
  # 4
119
127
  def generateCandidateKeywordScores phrases, scores, position_bonus
120
128
  candidates = Hash.new 0
121
- word_index = 0
129
+ phrase_index = 0
122
130
 
123
131
  phrases.each do |phrase|
124
132
  words = seperateWords(phrase)
@@ -126,13 +134,15 @@ class Keyphrase
126
134
  words.each do |word|
127
135
  score += scores[word]
128
136
 
129
- # Normalize the score based on the position
130
- if position_bonus
131
- normalized_score = 1.0 / (word_index + 1)
132
- score += normalized_score
133
- word_index += 1
134
- end
135
137
  end
138
+
139
+ # Boost score based on the phrase position in the text
140
+ if position_bonus
141
+ normalized_score = 1.0 / (phrase_index + 1)
142
+ score += normalized_score
143
+ phrase_index += 1
144
+ end
145
+
136
146
  candidates[phrase] = score
137
147
  end
138
148
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: keyphrase
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben D'Angelo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-06 00:00:00.000000000 Z
11
+ date: 2023-12-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec