keyphrase 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +74 -0
- data/Rakefile +8 -0
- data/lib/keyphrase/stoplist/eng.rb +1896 -0
- data/lib/keyphrase/stoplist.rb +21 -0
- data/lib/keyphrase/version.rb +5 -0
- data/lib/keyphrase.rb +153 -0
- data/sig/keyphrase.rbs +4 -0
- metadata +55 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
# Namespace that loads the per-language stoplist files found in the
# sibling `stoplist/` directory and exposes lookup helpers for them.
module Keyphrase::Stoplist
  # Dynamically require all files in the stoplist directory.
  # The glob result is sorted so the load order is deterministic on every
  # supported Ruby version (Dir[] only sorts by default since Ruby 3.0).
  Dir[File.join(__dir__, 'stoplist', '*.rb')].sort.each do |file|
    require_relative file
  end

  # Look up the stopword definition for a language.
  #
  # lang - language identifier (Symbol or String); it is capitalized and
  #        resolved as a constant of this module, e.g. :eng -> Eng.
  # type - :strict selects the language's `strict` list; any other value
  #        (default :smart) selects its `smart` list.
  #
  # Returns whatever the language constant's `strict` / `smart` method
  # returns (presumably a stopword Regexp - the language files are not
  # visible here, confirm against stoplist/eng.rb).
  # Raises NameError when no constant exists for the given language.
  def self.stopwords lang, type=:smart
    cl = const_get(lang.to_s.capitalize)

    type == :strict ? cl.strict : cl.smart
  end

  # Returns the objects bound to every constant defined in this module
  # (i.e. the loaded language stoplists).
  #
  # The original block body was empty, so the method returned an array of
  # nils; each constant name is now resolved to its value.
  def self.stoplist_classes
    constants.map { |const| const_get(const) }
  end

end
|
data/lib/keyphrase.rb
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "keyphrase/version"
|
4
|
+
|
5
|
+
# Keyword extractor following the Rapid Automatic Keyword Extraction (RAKE)
# scheme: text is split into sentence fragments, fragments are cut at
# stopwords into candidate phrases, every word is scored as
# (degree + frequency) / frequency, and a phrase scores the sum of its words.
class Keyphrase

  autoload :Stoplist, "keyphrase/stoplist"

  CLEAN_REGEX = /([^a-zA-Z0-9'\- \.]|(?<!\w)'|(?<!\w)\.)/
  BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-\']+\b/
  SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|(?<!\w)'(?!\w)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u

  # Convenience entry point that reuses one shared Keyphrase instance.
  # Takes the same arguments and returns the same Hash as #analyse.
  def self.analyse(text, options = {})
    # Class-level instance variable instead of @@keyphrase, so the cached
    # instance is not shared through the inheritance tree.
    @keyphrase ||= Keyphrase.new
    @keyphrase.analyse(text, options)
  end

  # Extract scored key phrases from +text+.
  #
  # text    - String to analyse.
  # options - Hash of optional settings:
  #           :stoplist        - Symbol naming a bundled list (default :smart)
  #                              or an Array of stopword pattern strings.
  #           :lang            - language of the bundled list (default :eng).
  #           :clean           - Regexp of characters blanked out before
  #                              phrase splitting (default CLEAN_REGEX).
  #           :blacklist       - Regexp removed from each candidate phrase
  #                              (default BLACKLIST_REGEX).
  #           :sentences_regex - Regexp used to split text into fragments
  #                              (default SENTENCES_REGEX).
  #           :position_bonus  - add 1/(n+1) for the n-th word seen
  #                              (default true; pass false to disable).
  #           :sort            - order result by descending score
  #                              (default true; pass false to keep
  #                              extraction order).
  #           :verbose         - print every phrase and score to stdout.
  #
  # Returns a Hash mapping phrase String => Float score.
  def analyse(text, options = {})
    stoplist        = options[:stoplist] || :smart
    lang            = options[:lang] || :eng
    clean_regex     = options[:clean] || CLEAN_REGEX
    blacklist       = options[:blacklist] || BLACKLIST_REGEX
    sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
    # `options[...] || true` could never be false; honour an explicit false.
    position_bonus  = boolean_option(options, :position_bonus, true)
    sort            = boolean_option(options, :sort, true)

    pattern    = buildStopwordRegExPattern(stoplist, lang)
    sentences  = text.split(sentences_regex)
    phrases    = generateCandidateKeywords(sentences, pattern, clean_regex, blacklist)
    wordscores = calculateWordScores(phrases)
    candidates = generateCandidateKeywordScores(phrases, wordscores, position_bonus)

    candidates = candidates.sort_by { |_phrase, score| -score }.to_h if sort

    if options[:verbose]
      candidates.each do |phrase, score|
        puts format('%.2f - %s', score, phrase)
      end
    end

    candidates
  end

  private

  # Read a boolean-like option, falling back to +default+ only when the
  # key is absent or nil (so an explicit false is preserved).
  def boolean_option(options, key, default)
    value = options[key]
    value.nil? ? default : value
  end

  # create stopword pattern
  # 1
  #
  # stopwords - Symbol naming a bundled list, or an Array of stopword
  #             pattern strings (interpolated into the regex unescaped, as
  #             before, so entries may contain regex syntax).
  # lang      - language used when +stopwords+ is a Symbol.
  #
  # Returns the pattern later handed to String#gsub to cut phrases.
  def buildStopwordRegExPattern(stopwords, lang)
    # Bundled lists are resolved by Keyphrase::Stoplist (autoloaded).
    return Keyphrase::Stoplist.stopwords(lang, stopwords) if stopwords.is_a?(Symbol)

    # An empty alternation would match every whitespace character and split
    # the text into single words; use a pattern that never matches instead.
    return /(?!)/ if stopwords.empty?

    # No /o flag: with it the interpolation ran only once per process and
    # every later custom stoplist silently reused the first pattern.
    /(?:^|\s)(?:#{stopwords.join('|')})(?:$|\s)/i
  end

  # generate candidate keywords
  # 2
  #
  # Blanks out +clean_regex+ matches, cuts each sentence at stopwords,
  # strips +blacklist+ matches from every piece and drops empty pieces.
  #
  # Returns an Array of phrases, unique ignoring case (first spelling wins).
  def generateCandidateKeywords(sentences, stopwords_regex, clean_regex, blacklist)
    pieces = sentences.flat_map do |sentence|
      sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|").split("|")
    end

    phrases = pieces.map { |piece| piece.gsub(blacklist, " ").strip }
                    .reject(&:empty?)

    # remove duplicate keywords
    phrases.uniq(&:downcase)
  end

  # calculate individual word scores
  # 3
  #
  # A word's degree grows by (phrase length - 1) for each phrase containing
  # it; its score is (degree + frequency) / frequency.
  #
  # Returns a Hash (default 0) mapping word => Float score.
  def calculateWordScores(phrases)
    word_freq   = Hash.new(0)
    word_degree = Hash.new(0)

    phrases.each do |phrase|
      words  = seperateWords(phrase)
      degree = words.length - 1

      words.each do |word|
        word_freq[word]   += 1
        word_degree[word] += degree
      end
    end

    word_score = Hash.new(0)
    word_freq.each do |word, freq|
      word_score[word] = (word_degree[word] + freq) / freq.to_f
    end

    word_score
  end

  # generate candidate keyword scores
  # 4
  #
  # Sums the word scores of every phrase. When +position_bonus+ is truthy,
  # each word additionally contributes 1.0 / (n + 1), where n counts the
  # words processed so far across all phrases.
  #
  # Returns a Hash (default 0) mapping phrase => score, in phrase order.
  def generateCandidateKeywordScores(phrases, scores, position_bonus)
    candidates = Hash.new(0)
    word_index = 0

    phrases.each do |phrase|
      score = 0

      seperateWords(phrase).each do |word|
        score += scores[word]

        # Normalize the score based on the position
        next unless position_bonus

        score += 1.0 / (word_index + 1)
        word_index += 1
      end

      candidates[phrase] = score
    end

    candidates
  end

  # Split a phrase into lower-cased words, dropping empty tokens and
  # tokens that parse as numbers.
  def seperateWords(text)
    # Word characters: letters, digits, "_", "+", "-", "'" and ".".
    # The former class /[^a-zA-Z0-9_\\+\\-\\'\\.]/ used string-style double
    # escapes; in a regex literal "\\-\\" is a backslash..backslash range,
    # so "-" was excluded and hyphenated words were split apart.
    text.split(/[^a-zA-Z0-9_+\-'.]/)
        .map { |word| word.strip.downcase }
        .reject { |word| word.empty? || Float(word, exception: false) }
  end

end
|
data/sig/keyphrase.rbs
ADDED
metadata
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: keyphrase
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ben D'Angelo
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-12-03 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm
|
14
|
+
in Ruby. Forked from the original rake_text gem.
|
15
|
+
email:
|
16
|
+
- ben@bendangelo.me
|
17
|
+
executables: []
|
18
|
+
extensions: []
|
19
|
+
extra_rdoc_files: []
|
20
|
+
files:
|
21
|
+
- ".rspec"
|
22
|
+
- LICENSE.txt
|
23
|
+
- README.md
|
24
|
+
- Rakefile
|
25
|
+
- lib/keyphrase.rb
|
26
|
+
- lib/keyphrase/stoplist.rb
|
27
|
+
- lib/keyphrase/stoplist/eng.rb
|
28
|
+
- lib/keyphrase/version.rb
|
29
|
+
- sig/keyphrase.rbs
|
30
|
+
homepage: https://github.com/bendangelo/keyphrase
|
31
|
+
licenses:
|
32
|
+
- MIT
|
33
|
+
metadata:
|
34
|
+
homepage_uri: https://github.com/bendangelo/keyphrase
|
35
|
+
source_code_uri: https://github.com/bendangelo/keyphrase
|
36
|
+
post_install_message:
|
37
|
+
rdoc_options: []
|
38
|
+
require_paths:
|
39
|
+
- lib
|
40
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: 2.6.0
|
45
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '0'
|
50
|
+
requirements: []
|
51
|
+
rubygems_version: 3.4.22
|
52
|
+
signing_key:
|
53
|
+
specification_version: 4
|
54
|
+
summary: Extracts keywords from texts using a stoplist and some magic.
|
55
|
+
test_files: []
|