keyphrase 0.1.0 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/keyphrase/version.rb +1 -1
- data/lib/keyphrase.rb +29 -17
- metadata +17 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: be1cbe539ab9b72b5eeaa0f8f648853b3953f06e0cbe96e9e1870b01695bde10
|
4
|
+
data.tar.gz: 8ba868ba0e54ffca3d02ec004b8bb6ed92a2c894a583e1a50205da0f43b39cc3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 19cf0a7f7752f6cf1bcba3f65687d52897dd6dc4d8fe068701cc4b847d2480dd00d851bee0cb313b9c9be736806098655a6f65ff2de341e797cf2cf45f15f685
|
7
|
+
data.tar.gz: cb5ad4e66d3413facc873691215f767cb3c9dff1af5dc301fd460dd26d64ce9b5f47c53c08520cf418eb5bc4973a2dee7e0fdd6590c9767205ee106b91a938b4
|
data/lib/keyphrase/version.rb
CHANGED
data/lib/keyphrase.rb
CHANGED
@@ -6,9 +6,10 @@ class Keyphrase
|
|
6
6
|
|
7
7
|
autoload :Stoplist, "keyphrase/stoplist"
|
8
8
|
|
9
|
-
CLEAN_REGEX = /([^a-zA-Z0-9'\- \.]|(?<!\w)
|
10
|
-
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z
|
11
|
-
|
9
|
+
CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
|
10
|
+
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-]+\b|\'/ # remove words with no letters, ie 123.23.12. And last chance to remove '
|
11
|
+
CLEAN_SPACES_REGEX = /\s+/
|
12
|
+
SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
|
12
13
|
|
13
14
|
def self.analyse text, options={}
|
14
15
|
@@keyphrase ||= Keyphrase.new
|
@@ -23,10 +24,11 @@ class Keyphrase
|
|
23
24
|
sort = options[:sort] || true
|
24
25
|
blacklist = options[:blacklist] || BLACKLIST_REGEX
|
25
26
|
sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
|
27
|
+
clean_spaces_regex = options[:clean_spaces_regex] || CLEAN_SPACES_REGEX
|
26
28
|
|
27
29
|
pattern = buildStopwordRegExPattern stoplist, lang
|
28
30
|
sentences = text.split sentences_regex
|
29
|
-
phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist
|
31
|
+
phrases = generateCandidateKeywords sentences, pattern, clean_regex, blacklist, clean_spaces_regex
|
30
32
|
wordscores = calculateWordScores phrases
|
31
33
|
candidates = generateCandidateKeywordScores phrases, wordscores, position_bonus
|
32
34
|
|
@@ -61,18 +63,26 @@ class Keyphrase
|
|
61
63
|
|
62
64
|
# generate candidate keywords
|
63
65
|
# 2
|
64
|
-
def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist
|
66
|
+
def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
|
65
67
|
phrases = Array.new
|
66
68
|
|
67
|
-
|
69
|
+
# first clean by removing unwanted special chars
|
70
|
+
# second remove all stop words
|
71
|
+
# third, remove uncaught stopwords in second pass
|
72
|
+
# using a | as an easy way to divide the text by stopwords
|
73
|
+
filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, " | ").gsub(stopwords_regex, "|") }
|
68
74
|
|
69
75
|
filtered_sentences.each do |parts|
|
70
76
|
parts.split("|").each do |part|
|
71
|
-
|
77
|
+
next if part.empty?
|
72
78
|
|
73
|
-
|
74
|
-
|
75
|
-
|
79
|
+
# remove blacklisted things, like 1234.45.34
|
80
|
+
# clean up spacing between words
|
81
|
+
part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip
|
82
|
+
|
83
|
+
next if part.empty?
|
84
|
+
|
85
|
+
phrases.push part
|
76
86
|
end
|
77
87
|
end
|
78
88
|
|
@@ -116,7 +126,7 @@ class Keyphrase
|
|
116
126
|
# 4
|
117
127
|
def generateCandidateKeywordScores phrases, scores, position_bonus
|
118
128
|
candidates = Hash.new 0
|
119
|
-
|
129
|
+
phrase_index = 0
|
120
130
|
|
121
131
|
phrases.each do |phrase|
|
122
132
|
words = seperateWords(phrase)
|
@@ -124,13 +134,15 @@ class Keyphrase
|
|
124
134
|
words.each do |word|
|
125
135
|
score += scores[word]
|
126
136
|
|
127
|
-
# Normalize the score based on the position
|
128
|
-
if position_bonus
|
129
|
-
normalized_score = 1.0 / (word_index + 1)
|
130
|
-
score += normalized_score
|
131
|
-
word_index += 1
|
132
|
-
end
|
133
137
|
end
|
138
|
+
|
139
|
+
# Boost score based on the phrase position in the text
|
140
|
+
if position_bonus
|
141
|
+
normalized_score = 1.0 / (phrase_index + 1)
|
142
|
+
score += normalized_score
|
143
|
+
phrase_index += 1
|
144
|
+
end
|
145
|
+
|
134
146
|
candidates[phrase] = score
|
135
147
|
end
|
136
148
|
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: keyphrase
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben D'Angelo
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-12-
|
12
|
-
dependencies:
|
11
|
+
date: 2023-12-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '3.2'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '3.2'
|
13
27
|
description: Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm
|
14
28
|
in Ruby. Forked from the original rake_text gem.
|
15
29
|
email:
|