keyphrase 0.1.1 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/keyphrase/version.rb +1 -1
- data/lib/keyphrase.rb +22 -12
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 56611558acf8336a81d11dc0b6cd1168ed0c008f48822efed61741b2448f3a65
|
4
|
+
data.tar.gz: 610725d8f12dbda7d041160bd98a976732f7a748644808e73afcc91f954c114d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e996bfb9191c68a4df50b9ad52d4deaf553141c097d4e3172b9dae24246ef4098d47d8e67ef2beaa27acc6ef94e497ddb0a804307dd1d80b64cb740d9c8fb310
|
7
|
+
data.tar.gz: 826ae9a9d3d3f1c1ffbcb381c7be7b3f6ed8410b89846494cd490e5ecc0dfddccb74f59088ef70c3f4b4e07dd5e9ca2b4a497d3d1ec805eb11a0179753ecda54
|
data/lib/keyphrase/version.rb
CHANGED
data/lib/keyphrase.rb
CHANGED
@@ -7,7 +7,7 @@ class Keyphrase
|
|
7
7
|
autoload :Stoplist, "keyphrase/stoplist"
|
8
8
|
|
9
9
|
CLEAN_REGEX = /([^a-zA-Z0-9\'\- \.]|(?<!\w)\.)/ # don't remove ' because it might be part of a stop word
|
10
|
-
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z
|
10
|
+
BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z]+\b|\'|\-/ # remove words with no letters, ie 123.23.12. And last chance to remove ' and -
|
11
11
|
CLEAN_SPACES_REGEX = /\s+/
|
12
12
|
SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|'(?=s)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u
|
13
13
|
|
@@ -66,15 +66,23 @@ class Keyphrase
|
|
66
66
|
def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
|
67
67
|
phrases = Array.new
|
68
68
|
|
69
|
-
|
69
|
+
# first clean by removing unwanted special chars
|
70
|
+
# second remove all stop words
|
71
|
+
# third, remove uncaught stopwords in second pass
|
72
|
+
# using a | as an easy way to divide the text by stopwords
|
73
|
+
filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, " | ").gsub(stopwords_regex, "|") }
|
70
74
|
|
71
75
|
filtered_sentences.each do |parts|
|
72
76
|
parts.split("|").each do |part|
|
77
|
+
next if part.empty?
|
78
|
+
|
79
|
+
# remove blacklisted things, like 1234.45.34
|
80
|
+
# clean up spacing between words
|
73
81
|
part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip
|
74
82
|
|
75
|
-
if
|
76
|
-
|
77
|
-
|
83
|
+
next if part.empty?
|
84
|
+
|
85
|
+
phrases.push part
|
78
86
|
end
|
79
87
|
end
|
80
88
|
|
@@ -118,7 +126,7 @@ class Keyphrase
|
|
118
126
|
# 4
|
119
127
|
def generateCandidateKeywordScores phrases, scores, position_bonus
|
120
128
|
candidates = Hash.new 0
|
121
|
-
|
129
|
+
phrase_index = 0
|
122
130
|
|
123
131
|
phrases.each do |phrase|
|
124
132
|
words = seperateWords(phrase)
|
@@ -126,13 +134,15 @@ class Keyphrase
|
|
126
134
|
words.each do |word|
|
127
135
|
score += scores[word]
|
128
136
|
|
129
|
-
# Normalize the score based on the position
|
130
|
-
if position_bonus
|
131
|
-
normalized_score = 1.0 / (word_index + 1)
|
132
|
-
score += normalized_score
|
133
|
-
word_index += 1
|
134
|
-
end
|
135
137
|
end
|
138
|
+
|
139
|
+
# Boost score based on the phrase position in the text
|
140
|
+
if position_bonus
|
141
|
+
normalized_score = 1.0 / (phrase_index + 1)
|
142
|
+
score += normalized_score
|
143
|
+
phrase_index += 1
|
144
|
+
end
|
145
|
+
|
136
146
|
candidates[phrase] = score
|
137
147
|
end
|
138
148
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: keyphrase
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben D'Angelo
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-12-
|
11
|
+
date: 2023-12-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|