keyphrase 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b93bac15e3ec17af2a33f9f4c92ffc127e6963acf520d7ef253599652662c8ac
4
- data.tar.gz: 55e9a1e431414ff360707bc5ba5319978bce1293a4eb9c83941baa3b9051f0e9
3
+ metadata.gz: 1d9478f7e5c47a826e90eae00884034ff4078262ce6b46a8f6c0a2070824b3c7
4
+ data.tar.gz: f14706384868b463ce8b968dbb4b6dd7e37bd6214d1d02b68ceb5d629930dec9
5
5
  SHA512:
6
- metadata.gz: db4181abd7629d99df0be7855dc20d4bf9cc64d1299d56973f474ab48abe940ec121063572843fc566631b32a0a69bda205152b817e8bc92f0ff8b6ce3101d91
7
- data.tar.gz: f2be99d0eca63fa493d2347c3121d2e817829ed3bc19dc62f15aaccc376dc1dca25c378d4001b259fe63a117cf4a74c4c701fc084a361f1114a0281d95643ce8
6
+ metadata.gz: 776507b887f30458a163f15999fcd0c57543b32c7acb8d604372180cc87f740a7c48d240311a9d5dbb9b0bdb0f46c5d02985332d622e10c85b2250bfd8f3af00
7
+ data.tar.gz: d408f218b406c45fe47c9315633aeb0fbe364cd5a3d597f2b77433bb3dc74289bfe8b90e5c5a187117d677b236f47763478fea4857c5619ce99cfb7f03946f65
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Keyphrase
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.1"
5
5
  end
data/lib/keyphrase.rb CHANGED
@@ -6,9 +6,10 @@ class Keyphrase
6
6
 
7
7
  autoload :Stoplist, "keyphrase/stoplist"
8
8
 
9
- CLEAN_REGEX = /([^a-zA-Z0-9'\- \.]|(?<!\w)'|(?<!\w)\.)/
10
- BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-\']+\b/
11
- SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|(?<!\w)'(?!\w)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
9
+ CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
10
+ BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-]+\b|\'/ # remove words with no letters, ie 123.23.12. And last chance to remove '
11
+ CLEAN_SPACES_REGEX = /\s+/
12
+ SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
12
13
 
13
14
  def self.analyse text, options={}
14
15
  @@keyphrase ||= Keyphrase.new
@@ -23,10 +24,11 @@ class Keyphrase
23
24
  sort = options[:sort] || true
24
25
  blacklist = options[:blacklist] || BLACKLIST_REGEX
25
26
  sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
27
+ clean_spaces_regex = options[:clean_spaces_regex] || CLEAN_SPACES_REGEX
26
28
 
27
29
  pattern = buildStopwordRegExPattern stoplist, lang
28
30
  sentences = text.split sentences_regex
29
- phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist
31
+ phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist, clean_spaces_regex
30
32
  wordscores = calculateWordScores phrases
31
33
  candidates = generateCandidateKeywordScores phrases, wordscores, position_bonus
32
34
 
@@ -61,14 +63,14 @@ class Keyphrase
61
63
 
62
64
  # generate candidate keywords
63
65
  # 2
64
- def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist
66
+ def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
65
67
  phrases = Array.new
66
68
 
67
69
  filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|") }
68
70
 
69
71
  filtered_sentences.each do |parts|
70
72
  parts.split("|").each do |part|
71
- part = part.gsub(blacklist, " ").strip
73
+ part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip
72
74
 
73
75
  if !part.empty?
74
76
  phrases.push part
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: keyphrase
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben D'Angelo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-03 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2023-12-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '3.2'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '3.2'
13
27
  description: Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm
14
28
  in Ruby. Forked from the original rake_text gem.
15
29
  email: