keyphrase 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/keyphrase/version.rb +1 -1
- data/lib/keyphrase.rb +8 -6
- metadata +17 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1d9478f7e5c47a826e90eae00884034ff4078262ce6b46a8f6c0a2070824b3c7
|
4
|
+
data.tar.gz: f14706384868b463ce8b968dbb4b6dd7e37bd6214d1d02b68ceb5d629930dec9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 776507b887f30458a163f15999fcd0c57543b32c7acb8d604372180cc87f740a7c48d240311a9d5dbb9b0bdb0f46c5d02985332d622e10c85b2250bfd8f3af00
|
7
|
+
data.tar.gz: d408f218b406c45fe47c9315633aeb0fbe364cd5a3d597f2b77433bb3dc74289bfe8b90e5c5a187117d677b236f47763478fea4857c5619ce99cfb7f03946f65
|
data/lib/keyphrase/version.rb
CHANGED
data/lib/keyphrase.rb
CHANGED
@@ -6,9 +6,10 @@ class Keyphrase
|
|
6
6
|
|
7
7
|
autoload :Stoplist, "keyphrase/stoplist"
|
8
8
|
|
9
|
-
CLEAN_REGEX = /([^a-zA-Z0-9'\- \.]|(?<!\w)
|
10
|
-
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z
|
11
|
-
|
9
|
+
CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
|
10
|
+
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-]+\b|\'/ # remove words with no letters, ie 123.23.12. And last chance to remove '
|
11
|
+
CLEAN_SPACES_REGEX = /\s+/
|
12
|
+
SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
|
12
13
|
|
13
14
|
def self.analyse text, options={}
|
14
15
|
@@keyphrase ||= Keyphrase.new
|
@@ -23,10 +24,11 @@ class Keyphrase
|
|
23
24
|
sort = options[:sort] || true
|
24
25
|
blacklist = options[:blacklist] || BLACKLIST_REGEX
|
25
26
|
sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
|
27
|
+
clean_spaces_regex = options[:clean_spaces_regex] || CLEAN_SPACES_REGEX
|
26
28
|
|
27
29
|
pattern = buildStopwordRegExPattern stoplist, lang
|
28
30
|
sentences = text.split sentences_regex
|
29
|
-
phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist
|
31
|
+
phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist, clean_spaces_regex
|
30
32
|
wordscores = calculateWordScores phrases
|
31
33
|
candidates = generateCandidateKeywordScores phrases, wordscores, position_bonus
|
32
34
|
|
@@ -61,14 +63,14 @@ class Keyphrase
|
|
61
63
|
|
62
64
|
# generate candidate keywords
|
63
65
|
# 2
|
64
|
-
def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist
|
66
|
+
def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
|
65
67
|
phrases = Array.new
|
66
68
|
|
67
69
|
filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|") }
|
68
70
|
|
69
71
|
filtered_sentences.each do |parts|
|
70
72
|
parts.split("|").each do |part|
|
71
|
-
part = part.gsub(blacklist, " ").strip
|
73
|
+
part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip
|
72
74
|
|
73
75
|
if !part.empty?
|
74
76
|
phrases.push part
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: keyphrase
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben D'Angelo
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-12-
|
12
|
-
dependencies: []
|
11
|
+
date: 2023-12-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '3.2'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '3.2'
|
13
27
|
description: Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm
|
14
28
|
in Ruby. Forked from the original rake_text gem.
|
15
29
|
email:
|