RubyGems - keyphrase - Versions diffs - 0.1.1 → 0.1.3 - Mend

keyphrase 0.1.1 → 0.1.3

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (4)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 1d9478f7e5c47a826e90eae00884034ff4078262ce6b46a8f6c0a2070824b3c7
-  data.tar.gz: f14706384868b463ce8b968dbb4b6dd7e37bd6214d1d02b68ceb5d629930dec9
+  metadata.gz: 56611558acf8336a81d11dc0b6cd1168ed0c008f48822efed61741b2448f3a65
+  data.tar.gz: 610725d8f12dbda7d041160bd98a976732f7a748644808e73afcc91f954c114d
 SHA512:
-  metadata.gz: 776507b887f30458a163f15999fcd0c57543b32c7acb8d604372180cc87f740a7c48d240311a9d5dbb9b0bdb0f46c5d02985332d622e10c85b2250bfd8f3af00
-  data.tar.gz: d408f218b406c45fe47c9315633aeb0fbe364cd5a3d597f2b77433bb3dc74289bfe8b90e5c5a187117d677b236f47763478fea4857c5619ce99cfb7f03946f65
+  metadata.gz: e996bfb9191c68a4df50b9ad52d4deaf553141c097d4e3172b9dae24246ef4098d47d8e67ef2beaa27acc6ef94e497ddb0a804307dd1d80b64cb740d9c8fb310
+  data.tar.gz: 826ae9a9d3d3f1c1ffbcb381c7be7b3f6ed8410b89846494cd490e5ecc0dfddccb74f59088ef70c3f4b4e07dd5e9ca2b4a497d3d1ec805eb11a0179753ecda54

data/lib/keyphrase/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 class Keyphrase
-  VERSION = "0.1.1"
+  VERSION = "0.1.3"
 end

data/lib/keyphrase.rb CHANGED Viewed

@@ -7,7 +7,7 @@ class Keyphrase
   autoload :Stoplist, "keyphrase/stoplist"
   CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
-  BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-]+\b|\'/ # remove words with no letters, ie 123.23.12. And last chance to remove '
+  BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
   CLEAN_SPACES_REGEX = /\s+/
   SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
@@ -66,15 +66,23 @@ class Keyphrase
   def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
     phrases = Array.new
-    filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|") }
+    # first clean by removing unwanted special chars
+    # second remove all stop words
+    # third, remove uncaught stopwords in second pass
+    # using a | as an easy way to divide the text by stopwords
+    filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, " | ").gsub(stopwords_regex, "|") }
     filtered_sentences.each do |parts|
       parts.split("|").each do |part|
+        next if part.empty?
+        # remove blacklisted things, like 1234.45.34
+        # clean up spacing between words
         part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip
-        if !part.empty?
-          phrases.push part
-        end
+        next if part.empty?
+        phrases.push part
       end
     end
@@ -118,7 +126,7 @@ class Keyphrase
   # 4
   def generateCandidateKeywordScores phrases, scores, position_bonus
     candidates = Hash.new 0
-    word_index = 0
+    phrase_index = 0
     phrases.each do |phrase|
       words = seperateWords(phrase)
@@ -126,13 +134,15 @@ class Keyphrase
       words.each do |word|
         score += scores[word]
-        # Normalize the score based on the position
-        if position_bonus
-          normalized_score = 1.0 / (word_index + 1)
-          score += normalized_score
-          word_index += 1
-        end
       end
+      # Boost score based on the phrase position in the text
+      if position_bonus
+        normalized_score = 1.0 / (phrase_index + 1)
+        score += normalized_score
+        phrase_index += 1
+      end
       candidates[phrase] = score
     end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: keyphrase
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.3
 platform: ruby
 authors:
 - Ben D'Angelo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-12-06 00:00:00.000000000 Z
+date: 2023-12-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec