keyphrase 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b93bac15e3ec17af2a33f9f4c92ffc127e6963acf520d7ef253599652662c8ac
4
- data.tar.gz: 55e9a1e431414ff360707bc5ba5319978bce1293a4eb9c83941baa3b9051f0e9
3
+ metadata.gz: be1cbe539ab9b72b5eeaa0f8f648853b3953f06e0cbe96e9e1870b01695bde10
4
+ data.tar.gz: 8ba868ba0e54ffca3d02ec004b8bb6ed92a2c894a583e1a50205da0f43b39cc3
5
5
  SHA512:
6
- metadata.gz: db4181abd7629d99df0be7855dc20d4bf9cc64d1299d56973f474ab48abe940ec121063572843fc566631b32a0a69bda205152b817e8bc92f0ff8b6ce3101d91
7
- data.tar.gz: f2be99d0eca63fa493d2347c3121d2e817829ed3bc19dc62f15aaccc376dc1dca25c378d4001b259fe63a117cf4a74c4c701fc084a361f1114a0281d95643ce8
6
+ metadata.gz: 19cf0a7f7752f6cf1bcba3f65687d52897dd6dc4d8fe068701cc4b847d2480dd00d851bee0cb313b9c9be736806098655a6f65ff2de341e797cf2cf45f15f685
7
+ data.tar.gz: cb5ad4e66d3413facc873691215f767cb3c9dff1af5dc301fd460dd26d64ce9b5f47c53c08520cf418eb5bc4973a2dee7e0fdd6590c9767205ee106b91a938b4
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Keyphrase
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.2"
5
5
  end
data/lib/keyphrase.rb CHANGED
@@ -6,9 +6,10 @@ class Keyphrase
6
6
 
7
7
  autoload :Stoplist, "keyphrase/stoplist"
8
8
 
9
- CLEAN_REGEX = /([^a-zA-Z0-9'\- \.]|(?<!\w)'|(?<!\w)\.)/
10
- BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-\']+\b/
11
- SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|(?<!\w)'(?!\w)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
9
+ CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
10
+ BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-]+\b|\'/ # remove words with no letters, e.g. 123.23.12; also the last chance to remove '
11
+ CLEAN_SPACES_REGEX = /\s+/
12
+ SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
12
13
 
13
14
  def self.analyse text, options={}
14
15
  @@keyphrase ||= Keyphrase.new
@@ -23,10 +24,11 @@ class Keyphrase
23
24
  sort = options[:sort] || true
24
25
  blacklist = options[:blacklist] || BLACKLIST_REGEX
25
26
  sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
27
+ clean_spaces_regex = options[:clean_spaces_regex] || CLEAN_SPACES_REGEX
26
28
 
27
29
  pattern = buildStopwordRegExPattern stoplist, lang
28
30
  sentences = text.split sentences_regex
29
- phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist
31
+ phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist, clean_spaces_regex
30
32
  wordscores = calculateWordScores phrases
31
33
  candidates = generateCandidateKeywordScores phrases, wordscores, position_bonus
32
34
 
@@ -61,18 +63,26 @@ class Keyphrase
61
63
 
62
64
  # generate candidate keywords
63
65
  # 2
64
- def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist
66
+ def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
65
67
  phrases = Array.new
66
68
 
67
- filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|") }
69
+ # first clean by removing unwanted special chars
70
+ # second remove all stop words
71
+ # third, apply the stop word regex a second time to remove any stop words the first pass missed
72
+ # using a | as an easy way to divide the text by stopwords
73
+ filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, " | ").gsub(stopwords_regex, "|") }
68
74
 
69
75
  filtered_sentences.each do |parts|
70
76
  parts.split("|").each do |part|
71
- part = part.gsub(blacklist, " ").strip
77
+ next if part.empty?
72
78
 
73
- if !part.empty?
74
- phrases.push part
75
- end
79
+ # remove blacklisted things, like 1234.45.34
80
+ # clean up spacing between words
81
+ part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip
82
+
83
+ next if part.empty?
84
+
85
+ phrases.push part
76
86
  end
77
87
  end
78
88
 
@@ -116,7 +126,7 @@ class Keyphrase
116
126
  # 4
117
127
  def generateCandidateKeywordScores phrases, scores, position_bonus
118
128
  candidates = Hash.new 0
119
- word_index = 0
129
+ phrase_index = 0
120
130
 
121
131
  phrases.each do |phrase|
122
132
  words = seperateWords(phrase)
@@ -124,13 +134,15 @@ class Keyphrase
124
134
  words.each do |word|
125
135
  score += scores[word]
126
136
 
127
- # Normalize the score based on the position
128
- if position_bonus
129
- normalized_score = 1.0 / (word_index + 1)
130
- score += normalized_score
131
- word_index += 1
132
- end
133
137
  end
138
+
139
+ # Boost score based on the phrase position in the text
140
+ if position_bonus
141
+ normalized_score = 1.0 / (phrase_index + 1)
142
+ score += normalized_score
143
+ phrase_index += 1
144
+ end
145
+
134
146
  candidates[phrase] = score
135
147
  end
136
148
 
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: keyphrase
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben D'Angelo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-03 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2023-12-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '3.2'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '3.2'
13
27
  description: Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm
14
28
  in Ruby. Forked from the original rake_text gem.
15
29
  email: