keyphrase 0.1.1 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1d9478f7e5c47a826e90eae00884034ff4078262ce6b46a8f6c0a2070824b3c7
4
- data.tar.gz: f14706384868b463ce8b968dbb4b6dd7e37bd6214d1d02b68ceb5d629930dec9
3
+ metadata.gz: 56611558acf8336a81d11dc0b6cd1168ed0c008f48822efed61741b2448f3a65
4
+ data.tar.gz: 610725d8f12dbda7d041160bd98a976732f7a748644808e73afcc91f954c114d
5
5
  SHA512:
6
- metadata.gz: 776507b887f30458a163f15999fcd0c57543b32c7acb8d604372180cc87f740a7c48d240311a9d5dbb9b0bdb0f46c5d02985332d622e10c85b2250bfd8f3af00
7
- data.tar.gz: d408f218b406c45fe47c9315633aeb0fbe364cd5a3d597f2b77433bb3dc74289bfe8b90e5c5a187117d677b236f47763478fea4857c5619ce99cfb7f03946f65
6
+ metadata.gz: e996bfb9191c68a4df50b9ad52d4deaf553141c097d4e3172b9dae24246ef4098d47d8e67ef2beaa27acc6ef94e497ddb0a804307dd1d80b64cb740d9c8fb310
7
+ data.tar.gz: 826ae9a9d3d3f1c1ffbcb381c7be7b3f6ed8410b89846494cd490e5ecc0dfddccb74f59088ef70c3f4b4e07dd5e9ca2b4a497d3d1ec805eb11a0179753ecda54
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Keyphrase
4
- VERSION = "0.1.1"
4
+ VERSION = "0.1.3"
5
5
  end
data/lib/keyphrase.rb CHANGED
@@ -7,7 +7,7 @@ class Keyphrase
7
7
  autoload :Stoplist, "keyphrase/stoplist"
8
8
 
9
9
  CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
10
- BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-]+\b|\'/ # remove words with no letters, ie 123.23.12. And last chance to remove '
10
+ BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
11
11
  CLEAN_SPACES_REGEX = /\s+/
12
12
  SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
13
13
 
@@ -66,15 +66,23 @@ class Keyphrase
66
66
  def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
67
67
  phrases = Array.new
68
68
 
69
- filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|") }
69
+ # first clean by removing unwanted special chars
70
+ # second remove all stop words
71
+ # third, remove uncaught stopwords in second pass
72
+ # using a | as an easy way to divide the text by stopwords
73
+ filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, " | ").gsub(stopwords_regex, "|") }
70
74
 
71
75
  filtered_sentences.each do |parts|
72
76
  parts.split("|").each do |part|
77
+ next if part.empty?
78
+
79
+ # remove blacklisted things, like 1234.45.34
80
+ # clean up spacing between words
73
81
  part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip
74
82
 
75
- if !part.empty?
76
- phrases.push part
77
- end
83
+ next if part.empty?
84
+
85
+ phrases.push part
78
86
  end
79
87
  end
80
88
 
@@ -118,7 +126,7 @@ class Keyphrase
118
126
  # 4
119
127
  def generateCandidateKeywordScores phrases, scores, position_bonus
120
128
  candidates = Hash.new 0
121
- word_index = 0
129
+ phrase_index = 0
122
130
 
123
131
  phrases.each do |phrase|
124
132
  words = seperateWords(phrase)
@@ -126,13 +134,15 @@ class Keyphrase
126
134
  words.each do |word|
127
135
  score += scores[word]
128
136
 
129
- # Normalize the score based on the position
130
- if position_bonus
131
- normalized_score = 1.0 / (word_index + 1)
132
- score += normalized_score
133
- word_index += 1
134
- end
135
137
  end
138
+
139
+ # Boost score based on the phrase position in the text
140
+ if position_bonus
141
+ normalized_score = 1.0 / (phrase_index + 1)
142
+ score += normalized_score
143
+ phrase_index += 1
144
+ end
145
+
136
146
  candidates[phrase] = score
137
147
  end
138
148
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: keyphrase
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben D'Angelo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-06 00:00:00.000000000 Z
11
+ date: 2023-12-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec