RubyGems - keyphrase - Versions diffs - 0.1.0 → 0.1.2 - Mend

keyphrase 0.1.0 → 0.1.2

This diff shows the changes between two publicly available versions of a package, as released to one of the supported public registries. It is provided for informational purposes only.

Files changed (4)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: b93bac15e3ec17af2a33f9f4c92ffc127e6963acf520d7ef253599652662c8ac
-  data.tar.gz: 55e9a1e431414ff360707bc5ba5319978bce1293a4eb9c83941baa3b9051f0e9
+  metadata.gz: be1cbe539ab9b72b5eeaa0f8f648853b3953f06e0cbe96e9e1870b01695bde10
+  data.tar.gz: 8ba868ba0e54ffca3d02ec004b8bb6ed92a2c894a583e1a50205da0f43b39cc3
 SHA512:
-  metadata.gz: db4181abd7629d99df0be7855dc20d4bf9cc64d1299d56973f474ab48abe940ec121063572843fc566631b32a0a69bda205152b817e8bc92f0ff8b6ce3101d91
-  data.tar.gz: f2be99d0eca63fa493d2347c3121d2e817829ed3bc19dc62f15aaccc376dc1dca25c378d4001b259fe63a117cf4a74c4c701fc084a361f1114a0281d95643ce8
+  metadata.gz: 19cf0a7f7752f6cf1bcba3f65687d52897dd6dc4d8fe068701cc4b847d2480dd00d851bee0cb313b9c9be736806098655a6f65ff2de341e797cf2cf45f15f685
+  data.tar.gz: cb5ad4e66d3413facc873691215f767cb3c9dff1af5dc301fd460dd26d64ce9b5f47c53c08520cf418eb5bc4973a2dee7e0fdd6590c9767205ee106b91a938b4

data/lib/keyphrase/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 class Keyphrase
-  VERSION = "0.1.0"
+  VERSION = "0.1.2"
 end

data/lib/keyphrase.rb CHANGED Viewed

@@ -6,9 +6,10 @@ class Keyphrase
   autoload :Stoplist, "keyphrase/stoplist"
-  CLEAN_REGEX = /([^a-zA-Z0-9'\- \.]|(?<!\w)'|(?<!\w)\.)/
-  BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-\']+\b/
-  SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|(?<!\w)'(?!\w)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
+  CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
+  BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-]+\b|\'/ # remove words with no letters, ie 123.23.12. And last chance to remove '
+  CLEAN_SPACES_REGEX = /\s+/
+  SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
   def self.analyse text, options={}
     @@keyphrase ||= Keyphrase.new
@@ -23,10 +24,11 @@ class Keyphrase
     sort = options[:sort] || true
     blacklist = options[:blacklist] || BLACKLIST_REGEX
     sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
+    clean_spaces_regex = options[:clean_spaces_regex] || CLEAN_SPACES_REGEX
     pattern    = buildStopwordRegExPattern stoplist, lang
     sentences  = text.split sentences_regex
-    phrases    = generateCandidateKeywords sentences, pattern, clean_regex, blacklist
+    phrases    = generateCandidateKeywords sentences, pattern, clean_regex, blacklist, clean_spaces_regex
     wordscores = calculateWordScores phrases
     candidates = generateCandidateKeywordScores phrases, wordscores, position_bonus
@@ -61,18 +63,26 @@ class Keyphrase
   # generate candidate keywords
   # 2
-  def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist
+  def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
     phrases = Array.new
-    filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|") }
+    # first clean by removing unwanted special chars
+    # second remove all stop words
+    # third, remove uncaught stopwords in second pass
+    # using a | as an easy way to divide the text by stopwords
+    filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, " | ").gsub(stopwords_regex, "|") }
     filtered_sentences.each do |parts|
       parts.split("|").each do |part|
-        part = part.gsub(blacklist, " ").strip
+        next if part.empty?
-        if !part.empty?
-          phrases.push part
-        end
+        # remove blacklisted things, like 1234.45.34
+        # clean up spacing between words
+        part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip
+        next if part.empty?
+        phrases.push part
       end
     end
@@ -116,7 +126,7 @@ class Keyphrase
   # 4
   def generateCandidateKeywordScores phrases, scores, position_bonus
     candidates = Hash.new 0
-    word_index = 0
+    phrase_index = 0
     phrases.each do |phrase|
       words = seperateWords(phrase)
@@ -124,13 +134,15 @@ class Keyphrase
       words.each do |word|
         score += scores[word]
-        # Normalize the score based on the position
-        if position_bonus
-          normalized_score = 1.0 / (word_index + 1)
-          score += normalized_score
-          word_index += 1
-        end
       end
+      # Boost score based on the phrase position in the text
+      if position_bonus
+        normalized_score = 1.0 / (phrase_index + 1)
+        score += normalized_score
+        phrase_index += 1
+      end
       candidates[phrase] = score
     end

metadata CHANGED Viewed

@@ -1,15 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: keyphrase
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.2
 platform: ruby
 authors:
 - Ben D'Angelo
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-12-03 00:00:00.000000000 Z
-dependencies: []
+date: 2023-12-07 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.2'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '3.2'
 description: Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm
   in Ruby. Forked from the original rake_text gem.
 email: