keyphrase 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/keyphrase/version.rb +1 -1
- data/lib/keyphrase.rb +21 -11
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: be1cbe539ab9b72b5eeaa0f8f648853b3953f06e0cbe96e9e1870b01695bde10
|
4
|
+
data.tar.gz: 8ba868ba0e54ffca3d02ec004b8bb6ed92a2c894a583e1a50205da0f43b39cc3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 19cf0a7f7752f6cf1bcba3f65687d52897dd6dc4d8fe068701cc4b847d2480dd00d851bee0cb313b9c9be736806098655a6f65ff2de341e797cf2cf45f15f685
|
7
|
+
data.tar.gz: cb5ad4e66d3413facc873691215f767cb3c9dff1af5dc301fd460dd26d64ce9b5f47c53c08520cf418eb5bc4973a2dee7e0fdd6590c9767205ee106b91a938b4
|
data/lib/keyphrase/version.rb
CHANGED
data/lib/keyphrase.rb
CHANGED
@@ -66,15 +66,23 @@ class Keyphrase
|
|
66
66
|
def generateCandidateKeywords sentences, stopwords_regex, clean_regex, blacklist, clean_spaces_regex
|
67
67
|
phrases = Array.new
|
68
68
|
|
69
|
-
|
69
|
+
# first clean by removing unwanted special chars
|
70
|
+
# second remove all stop words
|
71
|
+
# third, remove uncaught stopwords in second pass
|
72
|
+
# using a | as an easy way to divide the text by stopwords
|
73
|
+
filtered_sentences = sentences.map { |sentence| sentence.gsub(clean_regex, " ").gsub(stopwords_regex, " | ").gsub(stopwords_regex, "|") }
|
70
74
|
|
71
75
|
filtered_sentences.each do |parts|
|
72
76
|
parts.split("|").each do |part|
|
77
|
+
next if part.empty?
|
78
|
+
|
79
|
+
# remove blacklisted things, like 1234.45.34
|
80
|
+
# clean up spacing between words
|
73
81
|
part = part.gsub(blacklist, " ").gsub(clean_spaces_regex, " ").strip
|
74
82
|
|
75
|
-
if
|
76
|
-
|
77
|
-
|
83
|
+
next if part.empty?
|
84
|
+
|
85
|
+
phrases.push part
|
78
86
|
end
|
79
87
|
end
|
80
88
|
|
@@ -118,7 +126,7 @@ class Keyphrase
|
|
118
126
|
# 4
|
119
127
|
def generateCandidateKeywordScores phrases, scores, position_bonus
|
120
128
|
candidates = Hash.new 0
|
121
|
-
|
129
|
+
phrase_index = 0
|
122
130
|
|
123
131
|
phrases.each do |phrase|
|
124
132
|
words = seperateWords(phrase)
|
@@ -126,13 +134,15 @@ class Keyphrase
|
|
126
134
|
words.each do |word|
|
127
135
|
score += scores[word]
|
128
136
|
|
129
|
-
# Normalize the score based on the position
|
130
|
-
if position_bonus
|
131
|
-
normalized_score = 1.0 / (word_index + 1)
|
132
|
-
score += normalized_score
|
133
|
-
word_index += 1
|
134
|
-
end
|
135
137
|
end
|
138
|
+
|
139
|
+
# Boost score based on the phrase position in the text
|
140
|
+
if position_bonus
|
141
|
+
normalized_score = 1.0 / (phrase_index + 1)
|
142
|
+
score += normalized_score
|
143
|
+
phrase_index += 1
|
144
|
+
end
|
145
|
+
|
136
146
|
candidates[phrase] = score
|
137
147
|
end
|
138
148
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: keyphrase
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ben D'Angelo
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-12-
|
11
|
+
date: 2023-12-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|