keyphrase 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,21 @@
1
class Keyphrase
  # Namespace holding one constant per supported language (e.g. +Eng+ for
  # +:eng+). Each language constant is expected to respond to +smart+ and
  # +strict+; what those return is defined in the per-language files under
  # the +stoplist+ directory, which are not part of this file.
  #
  # The module is opened through +class Keyphrase+ (rather than the compact
  # +module Keyphrase::Stoplist+ form) so this file can be loaded on its own
  # without raising NameError when +Keyphrase+ has not been defined yet.
  module Stoplist
    # Dynamically require all files in the stoplist directory
    Dir[File.join(__dir__, 'stoplist', '*.rb')].each do |file|
      require_relative file
    end

    # Look up the stopword definition for a language.
    #
    # @param lang [Symbol, String] language name; it is capitalized and
    #   resolved as a constant of this module (+:eng+ -> +Stoplist::Eng+)
    # @param type [Symbol] +:strict+ selects the language's +strict+ list;
    #   any other value selects its +smart+ list
    # @return [Object] whatever the language constant's +strict+/+smart+
    #   method returns
    # @raise [NameError] when no constant for +lang+ exists in this module
    def self.stopwords(lang, type = :smart)
      # inherit = false keeps the lookup inside this namespace, so a name such
      # as :string can never resolve to an unrelated top-level constant.
      list = const_get(lang.to_s.capitalize, false)

      type == :strict ? list.strict : list.smart
    end

    # All constants defined directly in this module, resolved to their values
    # (the loaded language stoplists).
    #
    # @return [Array<Object>]
    def self.stoplist_classes
      constants(false).map { |name| const_get(name, false) }
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
class Keyphrase
  # Version string of the keyphrase gem.
  VERSION = '0.1.0'
end
data/lib/keyphrase.rb ADDED
@@ -0,0 +1,153 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "keyphrase/version"
4
+
5
# Keyword/keyphrase extractor: splits text into sentences, cuts each sentence
# at stopwords to obtain candidate phrases, scores every word by
# (degree + frequency) / frequency and sums the word scores per phrase.
class Keyphrase

  autoload :Stoplist, "keyphrase/stoplist"

  # Matches what is blanked out of a sentence before stopword splitting: any
  # character other than letters, digits, apostrophe, hyphen, space or dot,
  # plus an apostrophe or dot that is not preceded by a word character.
  CLEAN_REGEX = /([^a-zA-Z0-9'\- \.]|(?<!\w)'|(?<!\w)\.)/
  # Matches, at the start of a line or after whitespace, a run of characters
  # that are not letters, hyphens or apostrophes and that ends on a word
  # boundary; such runs are blanked out of every candidate phrase.
  BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-\']+\b/
  # Delimiters at which the input text is cut into sentences.
  SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|(?<!\w)'(?!\w)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u

  # Convenience entry point that reuses one memoized extractor instance.
  #
  # @param text [String] text to analyse
  # @param options [Hash] see {#analyse}
  # @return [Hash{String => Numeric}] phrase => score
  def self.analyse(text, options = {})
    # Class-level instance variable instead of @@keyphrase, which would be
    # shared across the whole inheritance tree.
    @keyphrase ||= new
    @keyphrase.analyse(text, options)
  end

  # Extract scored candidate phrases from +text+.
  #
  # @param text [String, nil] text to analyse; nil is treated as empty text
  # @param options [Hash]
  # @option options [Symbol, Array<String>] :stoplist (:smart) name of a
  #   built-in list resolved through Keyphrase::Stoplist, or an array of
  #   stopwords (interpolated into a regex as-is, so entries may be patterns)
  # @option options [Symbol] :lang (:eng) language for a built-in stoplist
  # @option options [Regexp] :clean (CLEAN_REGEX)
  # @option options [Regexp] :blacklist (BLACKLIST_REGEX)
  # @option options [Regexp] :sentences_regex (SENTENCES_REGEX)
  # @option options [Boolean] :position_bonus (true) add 1/(n + 1) to a
  #   phrase's score for its n-th word counted over the whole text
  # @option options [Boolean] :sort (true) order the result by descending score
  # @option options [Boolean] :verbose (false) print every phrase and score
  # @return [Hash{String => Numeric}] phrase => score
  def analyse(text, options = {})
    stoplist        = options[:stoplist] || :smart
    lang            = options[:lang] || :eng
    clean_regex     = options[:clean] || CLEAN_REGEX
    blacklist       = options[:blacklist] || BLACKLIST_REGEX
    sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
    # `options[:x] || true` can never be false; only an explicit false turns
    # these switches off, while a missing or nil value keeps the default.
    position_bonus  = options[:position_bonus] != false
    sort            = options[:sort] != false

    pattern    = buildStopwordRegExPattern(stoplist, lang)
    sentences  = text.to_s.split(sentences_regex)
    phrases    = generateCandidateKeywords(sentences, pattern, clean_regex, blacklist)
    wordscores = calculateWordScores(phrases)
    candidates = generateCandidateKeywordScores(phrases, wordscores, position_bonus)

    candidates = candidates.sort_by { |_phrase, score| -score }.to_h if sort

    if options[:verbose]
      candidates.each do |phrase, score|
        puts format('%.2f - %s', score, phrase)
      end
    end

    candidates
  end

  private

  # Step 1: build the pattern used to cut sentences at stopwords.
  #
  # @param stopwords [Symbol, Array<String>] built-in list name or custom words
  # @param lang [Symbol] language used when +stopwords+ is a Symbol
  # @return [Regexp] for an Array; for a Symbol, whatever
  #   Keyphrase::Stoplist.stopwords returns (it is passed to String#gsub)
  def buildStopwordRegExPattern(stopwords, lang)
    return Keyphrase::Stoplist.stopwords(lang, stopwords) if stopwords.is_a?(Symbol)

    # An empty alternation would match every whitespace character and split
    # the text into single words; use a pattern that never matches instead.
    return /(?!)/ if stopwords.empty?

    # No /o flag: it interpolates only once per process, so the first custom
    # stoplist would silently be reused for every later call.
    /(?:^|\s)(?:#{stopwords.join('|')})(?:$|\s)/i
  end

  # Step 2: turn sentences into unique candidate phrases.
  #
  # @param sentences [Array<String>]
  # @param stopwords_regex [Regexp] matches are replaced by the "|" separator
  # @param clean_regex [Regexp] matches are replaced by a space beforehand
  # @param blacklist [Regexp] matches are replaced by a space in each phrase
  # @return [Array<String>] non-empty phrases, de-duplicated case-insensitively
  def generateCandidateKeywords(sentences, stopwords_regex, clean_regex, blacklist)
    phrases = sentences.flat_map do |sentence|
      sentence
        .gsub(clean_regex, " ")
        .gsub(stopwords_regex, "|")
        .split("|")
        .map { |part| part.gsub(blacklist, " ").strip }
    end

    # remove duplicate keywords
    phrases.reject(&:empty?).uniq(&:downcase)
  end

  # Step 3: score every word as (degree + frequency) / frequency, where the
  # degree of a word grows by (phrase length - 1) for each phrase containing it.
  #
  # @param phrases [Array<String>]
  # @return [Hash{String => Float}] word => score (missing words default to 0)
  def calculateWordScores(phrases)
    word_freq   = Hash.new(0)
    word_degree = Hash.new(0)

    phrases.each do |phrase|
      words  = seperateWords(phrase)
      degree = words.length - 1

      words.each do |word|
        word_freq[word]   += 1
        word_degree[word] += degree
      end
    end

    word_score = Hash.new(0)
    word_freq.each do |word, freq|
      word_score[word] = (word_degree[word] + freq) / freq.to_f
    end

    word_score
  end

  # Step 4: sum the word scores of each phrase.
  #
  # @param phrases [Array<String>]
  # @param scores [Hash{String => Numeric}] word => score
  # @param position_bonus [Boolean] when truthy, every word additionally
  #   contributes 1.0 / (its running index over all phrases + 1)
  # @return [Hash{String => Numeric}] phrase => score (default value 0)
  def generateCandidateKeywordScores(phrases, scores, position_bonus)
    candidates = Hash.new(0)
    word_index = 0

    phrases.each do |phrase|
      score = 0

      seperateWords(phrase).each do |word|
        score += scores[word]
        next unless position_bonus

        # Normalize the score based on the position
        score += 1.0 / (word_index + 1)
        word_index += 1
      end

      candidates[phrase] = score
    end

    candidates
  end

  # Split a phrase into lower-cased words, dropping purely numeric tokens.
  #
  # @param text [String]
  # @return [Array<String>]
  def seperateWords(text)
    # Word characters: letters, digits, "_", "+", "-", "'" and ".". The
    # previous class was written with doubled backslashes, which turned
    # `\\-\\` into a backslash-to-backslash range and therefore split
    # hyphenated words apart.
    text.split(/[^a-zA-Z0-9_+\-'.]/).each_with_object([]) do |raw, words|
      word = raw.strip.downcase
      next if word.empty?
      # Float(..., exception: false) returns nil for non-numeric input,
      # replacing the inline `rescue` used as control flow.
      next if Float(word, exception: false)

      words << word
    end
  end

end
data/sig/keyphrase.rbs ADDED
@@ -0,0 +1,4 @@
1
# Keyphrase is defined as a class in lib/keyphrase.rb, so it is declared as a
# class here as well; declaring it as a module contradicts the Ruby source.
class Keyphrase
  VERSION: String
  # See the writing guide of rbs: https://github.com/ruby/rbs#guides

  # Both entry points take the text plus an optional options hash and return
  # a phrase => score hash.
  def self.analyse: (String text, ?Hash[Symbol, untyped] options) -> Hash[String, Numeric]

  def analyse: (String text, ?Hash[Symbol, untyped] options) -> Hash[String, Numeric]
end
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: keyphrase
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ben D'Angelo
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-12-03 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm
14
+ in Ruby. Forked from the original rake_text gem.
15
+ email:
16
+ - ben@bendangelo.me
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".rspec"
22
+ - LICENSE.txt
23
+ - README.md
24
+ - Rakefile
25
+ - lib/keyphrase.rb
26
+ - lib/keyphrase/stoplist.rb
27
+ - lib/keyphrase/stoplist/eng.rb
28
+ - lib/keyphrase/version.rb
29
+ - sig/keyphrase.rbs
30
+ homepage: https://github.com/bendangelo/keyphrase
31
+ licenses:
32
+ - MIT
33
+ metadata:
34
+ homepage_uri: https://github.com/bendangelo/keyphrase
35
+ source_code_uri: https://github.com/bendangelo/keyphrase
36
+ post_install_message:
37
+ rdoc_options: []
38
+ require_paths:
39
+ - lib
40
+ required_ruby_version: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: 2.6.0
45
+ required_rubygems_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: '0'
50
+ requirements: []
51
+ rubygems_version: 3.4.22
52
+ signing_key:
53
+ specification_version: 4
54
+ summary: Extracts keywords from texts using a stoplist and some magic.
55
+ test_files: []