keyphrase 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b93bac15e3ec17af2a33f9f4c92ffc127e6963acf520d7ef253599652662c8ac
4
- data.tar.gz: 55e9a1e431414ff360707bc5ba5319978bce1293a4eb9c83941baa3b9051f0e9
3
+ metadata.gz: 1d9478f7e5c47a826e90eae00884034ff4078262ce6b46a8f6c0a2070824b3c7
4
+ data.tar.gz: f14706384868b463ce8b968dbb4b6dd7e37bd6214d1d02b68ceb5d629930dec9
5
5
  SHA512:
6
- metadata.gz: db4181abd7629d99df0be7855dc20d4bf9cc64d1299d56973f474ab48abe940ec121063572843fc566631b32a0a69bda205152b817e8bc92f0ff8b6ce3101d91
7
- data.tar.gz: f2be99d0eca63fa493d2347c3121d2e817829ed3bc19dc62f15aaccc376dc1dca25c378d4001b259fe63a117cf4a74c4c701fc084a361f1114a0281d95643ce8
6
+ metadata.gz: 776507b887f30458a163f15999fcd0c57543b32c7acb8d604372180cc87f740a7c48d240311a9d5dbb9b0bdb0f46c5d02985332d622e10c85b2250bfd8f3af00
7
+ data.tar.gz: d408f218b406c45fe47c9315633aeb0fbe364cd5a3d597f2b77433bb3dc74289bfe8b90e5c5a187117d677b236f47763478fea4857c5619ce99cfb7f03946f65
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class Keyphrase
4
- VERSION = "0.1.0"
4
+ VERSION = "0.1.1"
5
5
  end
data/lib/keyphrase.rb CHANGED
@@ -6,9 +6,10 @@ class Keyphrase
6
6
 
7
7
  autoload :Stoplist, "keyphrase/stoplist"
8
8
 
9
- CLEAN_REGEX = /([^a-zA-Z0-9'\- \.]|(?<!\w)'|(?<!\w)\.)/
10
- BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-\']+\b/
11
- SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|(?<!\w)'(?!\w)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
9
+ CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
10
+ BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-]+\b|\'/ # remove words with no letters, ie 123.23.12. And last chance to remove '
11
+ CLEAN_SPACES_REGEX = /\s+/
12
+ SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
12
13
 
13
14
  def self.analyse text, options={}
14
15
  @@keyphrase ||= Keyphrase.new
@@ -23,10 +24,11 @@ class Keyphrase
23
24
  sort = options[:sort] || true
24
25
  blacklist = options[:blacklist] || BLACKLIST_REGEX
25
26
  sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
27
+ clean_spaces_regex = options[:clean_spaces_regex] || CLEAN_SPACES_REGEX
26
28
 
27
29
  pattern = buildStopwordRegExPattern stoplist, lang
28
30
  sentences = text.split sentences_regex
29
- phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist
31
+ phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist, clean_spaces_regex
30
32
  wordscores = calculateWordScores phrases
31
33
  candidates = generateCandidateKeywordScores phrases, wordscores, position_bonus
32
34
 
@@ -61,14 +63,14 @@ class Keyphrase
61
63
 
62
64
  # generate candidate keywords
63
65
  # 2
64
- def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist
66
+ def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
65
67
  phrases = Array.new
66
68
 
67
69
  filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|") }
68
70
 
69
71
  filtered_sentences.each do |parts|
70
72
  parts.split("|").each do |part|
71
- part = part.gsub(blacklist, " ").strip
73
+ part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip
72
74
 
73
75
  if !part.empty?
74
76
  phrases.push part
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: keyphrase
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ben D'Angelo
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-12-03 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2023-12-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '3.2'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '3.2'
13
27
  description: Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm
14
28
  in Ruby. Forked from the original rake_text gem.
15
29
  email: