keyphrase 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +74 -0
- data/Rakefile +8 -0
- data/lib/keyphrase/stoplist/eng.rb +1896 -0
- data/lib/keyphrase/stoplist.rb +21 -0
- data/lib/keyphrase/version.rb +5 -0
- data/lib/keyphrase.rb +153 -0
- data/sig/keyphrase.rbs +4 -0
- metadata +55 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
class Keyphrase
  # Namespace for the per-language stopword lists.
  #
  # Every Ruby file in the sibling +stoplist+ directory is loaded when this
  # module is defined. Each language is looked up as a constant nested in this
  # module (for example +:eng+ resolves to +Eng+) and must respond to +smart+
  # and +strict+.
  module Stoplist
    # Dynamically require all files in the stoplist directory.
    # Sorted so the load order does not depend on the platform's glob order.
    Dir[File.join(__dir__, 'stoplist', '*.rb')].sort.each do |file|
      require_relative file
    end

    # Returns the stopword list for a language.
    #
    # @param lang [Symbol, String] language identifier; capitalized to find the
    #   constant nested in this module (+:eng+ -> +Eng+)
    # @param type [Symbol] +:strict+ selects the strict list; any other value
    #   selects the smart list
    # @return [Object] whatever the language's +strict+ / +smart+ method returns
    # @raise [NameError] if no constant for +lang+ is defined in this module
    def self.stopwords(lang, type = :smart)
      # inherit = false keeps the lookup inside this namespace, so a language
      # such as :string cannot accidentally resolve to ::String.
      list = const_get(lang.to_s.capitalize, false)

      type == :strict ? list.strict : list.smart
    end

    # Lists the stoplist modules/classes currently defined in this namespace.
    #
    # @return [Array<Module>] one entry per nested module or class
    def self.stoplist_classes
      constants(false).map { |name| const_get(name, false) }.grep(Module)
    end
  end
end
|
data/lib/keyphrase.rb
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "keyphrase/version"
|
4
|
+
|
5
|
+
# Keyword extraction: text is split into sentence fragments, fragments are cut
# at stopwords into candidate phrases, each word is scored by
# (degree + frequency) / frequency, and a phrase's score is the sum of its
# word scores (optionally plus a position bonus).
class Keyphrase

  autoload :Stoplist, "keyphrase/stoplist"

  CLEAN_REGEX = /([^a-zA-Z0-9'\- \.]|(?<!\w)'|(?<!\w)\.)/
  BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-\']+\b/
  SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|(?<!\w)'(?!\w)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u

  # Characters that separate words inside a phrase: anything that is not a
  # letter, digit, underscore, plus, hyphen, apostrophe or dot.
  WORD_SEPARATOR_REGEX = /[^a-zA-Z0-9_+\-'.]/
  private_constant :WORD_SEPARATOR_REGEX

  # Convenience entry point that reuses one memoized instance.
  #
  # @param text [String] text to analyse
  # @param options [Hash] see {#analyse}
  # @return [Hash{String => Numeric}] phrase => score
  def self.analyse(text, options = {})
    @keyphrase ||= new
    @keyphrase.analyse(text, options)
  end

  # Extracts candidate key phrases from +text+ and scores them.
  #
  # @param text [String, nil] text to analyse; +nil+ is treated as empty
  # @param options [Hash]
  # @option options [Symbol, Array] :stoplist (:smart) a built-in list name
  #   (resolved through Keyphrase::Stoplist) or a custom list of stopwords
  # @option options [Symbol] :lang (:eng) language of the built-in list
  # @option options [Regexp] :clean (CLEAN_REGEX) characters replaced by spaces
  # @option options [Regexp] :blacklist (BLACKLIST_REGEX) removed from phrases
  # @option options [Regexp] :sentences_regex (SENTENCES_REGEX) sentence splitter
  # @option options [Boolean] :position_bonus (true) pass +false+ to disable
  # @option options [Boolean] :sort (true) pass +false+ to keep discovery order
  # @option options [Boolean] :verbose (false) print each phrase and score
  # @return [Hash{String => Numeric}] phrase => score, highest first when sorted
  def analyse(text, options = {})
    stoplist        = options[:stoplist] || :smart
    lang            = options[:lang] || :eng
    clean_regex     = options[:clean] || CLEAN_REGEX
    blacklist       = options[:blacklist] || BLACKLIST_REGEX
    sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
    # `options[:x] || true` is always true; only an explicit false disables.
    position_bonus  = options[:position_bonus] != false
    sort            = options[:sort] != false

    pattern    = buildStopwordRegExPattern(stoplist, lang)
    sentences  = text.to_s.split(sentences_regex)
    phrases    = generateCandidateKeywords(sentences, pattern, clean_regex, blacklist)
    wordscores = calculateWordScores(phrases)
    candidates = generateCandidateKeywordScores(phrases, wordscores, position_bonus)

    candidates = candidates.sort_by { |_phrase, score| -score }.to_h if sort

    if options[:verbose]
      candidates.each { |phrase, score| puts format('%.2f - %s', score, phrase) }
    end

    candidates
  end

  private

  # Step 1: build the pattern used to cut sentences at stopwords.
  #
  # @param stopwords [Symbol, Array<String, Regexp>] built-in list name or a
  #   custom list of stopwords
  # @param lang [Symbol] language used when +stopwords+ is a Symbol
  # @return [Object] for a Symbol, whatever Keyphrase::Stoplist.stopwords
  #   returns; for a custom list, a case-insensitive Regexp
  def buildStopwordRegExPattern(stopwords, lang)
    return Keyphrase::Stoplist.stopwords(lang, stopwords) if stopwords.is_a?(Symbol)

    words = Array(stopwords)
            .map { |word| word.is_a?(Regexp) ? word : word.to_s.strip }
            .reject { |word| word == "" }

    # * No /o flag: the pattern must be rebuilt for every list, otherwise the
    #   first custom list would be reused for all later calls.
    # * Regexp.union escapes plain strings (so "a.m." is literal) and yields a
    #   never-matching pattern for an empty list.
    # * Lookarounds instead of consuming the surrounding whitespace, so two
    #   consecutive stopwords ("of the") are both matched.
    /(?<!\S)(?:#{Regexp.union(words).source})(?!\S)/i
  end

  # Step 2: split sentences into candidate phrases at stopwords.
  #
  # @param sentences [Array<String>]
  # @param stopwords_regex [Regexp] matches replaced by the "|" delimiter
  # @param clean_regex [Regexp] matches replaced by a space before splitting
  # @param blacklist [Regexp] matches replaced by a space inside each phrase
  # @return [Array<String>] non-empty phrases, unique ignoring case
  def generateCandidateKeywords(sentences, stopwords_regex, clean_regex, blacklist)
    phrases = sentences.flat_map do |sentence|
      delimited = sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|")
      delimited.split("|").map { |part| part.gsub(blacklist, " ").strip }
    end

    # remove blanks and duplicate keywords
    phrases.reject(&:empty?).uniq(&:downcase)
  end

  # Step 3: score every word as (degree + frequency) / frequency, where a
  # word's degree grows by (phrase length - 1) for each phrase containing it.
  #
  # @param phrases [Array<String>]
  # @return [Hash{String => Float}] word => score (default 0 for unknown words)
  def calculateWordScores(phrases)
    word_freq = Hash.new(0)
    word_degree = Hash.new(0)

    phrases.each do |phrase|
      words = seperateWords(phrase)
      degree = words.length - 1

      words.each do |word|
        word_freq[word] += 1
        word_degree[word] += degree
      end
    end

    word_freq.each_with_object(Hash.new(0)) do |(word, freq), word_score|
      word_score[word] = (word_degree[word] + freq) / freq.to_f
    end
  end

  # Step 4: score each phrase as the sum of its word scores.
  #
  # When +position_bonus+ is truthy, each word additionally contributes
  # 1 / (n + 1), where n counts the words seen so far across all phrases, so
  # earlier words weigh more.
  #
  # @param phrases [Array<String>]
  # @param scores [Hash{String => Numeric}] word => score
  # @param position_bonus [Boolean]
  # @return [Hash{String => Numeric}] phrase => score
  def generateCandidateKeywordScores(phrases, scores, position_bonus)
    word_index = 0

    phrases.each_with_object(Hash.new(0)) do |phrase, candidates|
      score = 0

      seperateWords(phrase).each do |word|
        score += scores[word]
        next unless position_bonus

        # Normalize the score based on the position
        score += 1.0 / (word_index + 1)
        word_index += 1
      end

      candidates[phrase] = score
    end
  end

  # Splits a phrase into lower-cased words, dropping blanks and tokens that
  # parse as numbers.
  #
  # @param text [String]
  # @return [Array<String>]
  def seperateWords(text)
    text.split(WORD_SEPARATOR_REGEX).each_with_object([]) do |token, words|
      word = token.strip.downcase
      # Float(..., exception: false) returns nil for non-numeric input.
      next if word.empty? || Float(word, exception: false)

      words << word
    end
  end

end
|
data/sig/keyphrase.rbs
ADDED
metadata
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: keyphrase
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ben D'Angelo
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-12-03 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm
|
14
|
+
in Ruby. Forked from the original rake_text gem.
|
15
|
+
email:
|
16
|
+
- ben@bendangelo.me
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- ".rspec"
|
22
|
+
- LICENSE.txt
|
23
|
+
- README.md
|
24
|
+
- Rakefile
|
25
|
+
- lib/keyphrase.rb
|
26
|
+
- lib/keyphrase/stoplist.rb
|
27
|
+
- lib/keyphrase/stoplist/eng.rb
|
28
|
+
- lib/keyphrase/version.rb
|
29
|
+
- sig/keyphrase.rbs
|
30
|
+
homepage: https://github.com/bendangelo/keyphrase
|
31
|
+
licenses:
|
32
|
+
- MIT
|
33
|
+
metadata:
|
34
|
+
homepage_uri: https://github.com/bendangelo/keyphrase
|
35
|
+
source_code_uri: https://github.com/bendangelo/keyphrase
|
36
|
+
post_install_message:
|
37
|
+
rdoc_options: []
|
38
|
+
require_paths:
|
39
|
+
- lib
|
40
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: 2.6.0
|
45
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '0'
|
50
|
+
requirements: []
|
51
|
+
rubygems_version: 3.4.22
|
52
|
+
signing_key:
|
53
|
+
specification_version: 4
|
54
|
+
summary: Extracts keywords from texts using a stoplist and some magic.
|
55
|
+
test_files: []
|