keyphrase 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
class Keyphrase
  # Namespace for the per-language stoplist providers.
  #
  # Every Ruby file found in the sibling +stoplist+ directory is loaded when
  # this module is defined. Each provider is looked up as a constant of this
  # module named after the capitalised language code (e.g. +:eng+ -> +Eng+)
  # and is expected to respond to +smart+ and +strict+.
  module Stoplist
    # Dynamically require all files in the stoplist directory. The list is
    # sorted so the load order does not depend on the filesystem.
    Dir[File.join(__dir__, 'stoplist', '*.rb')].sort.each do |file|
      require_relative file
    end

    # Fetches the stopword definition for a language.
    #
    # @param lang [Symbol, String] language code, e.g. +:eng+
    # @param type [Symbol] +:strict+ selects the provider's +strict+ list;
    #   any other value selects +smart+
    # @return [Object] whatever the provider's +strict+ / +smart+ returns
    # @raise [NameError] when no provider is defined for +lang+
    def self.stopwords(lang, type = :smart)
      # inherit=false: only constants defined on Stoplist itself qualify, so
      # a language such as :string can never resolve to ::String.
      provider = const_get(lang.to_s.capitalize, false)

      type == :strict ? provider.strict : provider.smart
    end

    # Lists every stoplist provider currently defined under this module.
    #
    # @return [Array<Module>] the provider constants' values
    def self.stoplist_classes
      constants.map { |name| const_get(name, false) }
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
class Keyphrase
  # Release number of the keyphrase gem.
  VERSION = '0.1.0'
end
data/lib/keyphrase.rb ADDED
@@ -0,0 +1,153 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "keyphrase/version"
4
+
5
# Keyword extraction: the text is cut into sentence fragments, fragments are
# split into candidate phrases at stopwords, every word is scored from its
# frequency and co-occurrence degree, and each phrase is scored as the sum of
# its word scores (optionally with a position bonus).
class Keyphrase

  autoload :Stoplist, "keyphrase/stoplist"

  # Characters matching this pattern are replaced by a space before the
  # stopword split.
  CLEAN_REGEX = /([^a-zA-Z0-9'\- \.]|(?<!\w)'|(?<!\w)\.)/
  # Spans matching this pattern are blanked out of every candidate phrase.
  BLACKLIST_REGEX = /(?:^|\s)[^a-zA-Z\-\']+\b/
  # Delimiters used to cut the input text into sentence fragments.
  SENTENCES_REGEX = /[+!?,;:&\[\]\{\}\<\>\=\/\n\t\\"\\(\\)\u2019\u2013\|]|-(?!\w)|(?<!\w)'(?!\w)|(?<!\s)\.(?![a-zA-Z0-9])|(?<!\w)\#(?=\w)/u

  # Anything that is not a letter, digit, underscore, "+", "-", "'" or "."
  # separates two words inside a phrase.
  WORD_SEPARATOR_REGEX = /[^a-zA-Z0-9_+\-'.]/
  private_constant :WORD_SEPARATOR_REGEX

  # Convenience entry point that reuses one memoised instance.
  #
  # @param text [String] text to analyse
  # @param options [Hash] see {#analyse}
  # @return [Hash{String => Numeric}] phrase => score
  def self.analyse(text, options = {})
    @keyphrase ||= new
    @keyphrase.analyse(text, options)
  end

  # Extracts scored key phrases from +text+.
  #
  # @param text [String, nil] text to analyse (+nil+ is treated as empty)
  # @param options [Hash]
  # @option options [Symbol, Array<String>] :stoplist (:smart) name of a
  #   built-in stoplist, or a custom list of stopwords
  # @option options [Symbol] :lang (:eng) language of the built-in stoplist
  # @option options [Regexp] :clean (CLEAN_REGEX)
  # @option options [Regexp] :blacklist (BLACKLIST_REGEX)
  # @option options [Regexp] :sentences_regex (SENTENCES_REGEX)
  # @option options [Boolean] :position_bonus (true) add a bonus that
  #   shrinks with the running word position
  # @option options [Boolean] :sort (true) order the result by descending score
  # @option options [Boolean] :verbose print every phrase with its score
  # @return [Hash{String => Numeric}] phrase => score
  def analyse(text, options = {})
    stoplist        = options[:stoplist] || :smart
    lang            = options[:lang] || :eng
    clean_regex     = options[:clean] || CLEAN_REGEX
    blacklist       = options[:blacklist] || BLACKLIST_REGEX
    sentences_regex = options[:sentences_regex] || SENTENCES_REGEX
    # `options[...] || true` could never yield false, so these flags need a
    # nil-aware default instead.
    position_bonus  = flag_option(options, :position_bonus, true)
    sort            = flag_option(options, :sort, true)

    pattern    = buildStopwordRegExPattern(stoplist, lang)
    sentences  = text.to_s.split(sentences_regex)
    phrases    = generateCandidateKeywords(sentences, pattern, clean_regex, blacklist)
    wordscores = calculateWordScores(phrases)
    candidates = generateCandidateKeywordScores(phrases, wordscores, position_bonus)

    candidates = candidates.sort_by { |_phrase, score| -score }.to_h if sort

    if options[:verbose]
      candidates.each do |word, score|
        puts format('%.2f - %s', score, word)
      end
    end

    candidates
  end

  private

  # Reads a boolean option, falling back to +default+ only when the key is
  # absent or nil, so an explicit +false+ is honoured.
  def flag_option(options, key, default)
    value = options[key]
    value.nil? ? default : value
  end

  # Step 1: builds the pattern used to cut sentences at stopwords.
  #
  # A Symbol selects a built-in stoplist; anything else is treated as a list
  # of literal stopwords.
  def buildStopwordRegExPattern(stopwords, lang)
    return Keyphrase::Stoplist.stopwords(lang, stopwords) if stopwords.is_a?(Symbol)

    # An empty alternation would match at every whitespace character; with no
    # stopwords nothing should match at all.
    return /(?!)/ if stopwords.empty?

    alternatives = stopwords.map { |word| Regexp.escape(word.to_s) }.join('|')

    # No /o flag: the pattern must be rebuilt for every stoplist. The trailing
    # boundary is a lookahead so the whitespace stays available as the leading
    # boundary of an immediately following stopword.
    /(?:^|\s)(?:#{alternatives})(?=$|\s)/i
  end

  # Step 2: turns sentence fragments into unique candidate phrases.
  def generateCandidateKeywords(sentences, stopwords_regex, clean_regex, blacklist)
    parts = sentences.flat_map do |sentence|
      sentence.gsub(clean_regex, " ").gsub(stopwords_regex, "|").split("|")
    end

    parts.map { |part| part.gsub(blacklist, " ").strip }
         .reject(&:empty?)
         .uniq(&:downcase) # remove duplicate keywords, ignoring case
  end

  # Step 3: scores every word as (degree + frequency) / frequency, where the
  # degree grows by (phrase length - 1) for each phrase containing the word.
  def calculateWordScores(phrases)
    word_freq   = Hash.new(0)
    word_degree = Hash.new(0)

    phrases.each do |phrase|
      words  = seperateWords(phrase)
      degree = words.length - 1

      words.each do |word|
        word_freq[word]   += 1
        word_degree[word] += degree
      end
    end

    word_freq.each_with_object(Hash.new(0)) do |(word, freq), word_score|
      word_score[word] = (word_degree[word] + freq) / freq.to_f
    end
  end

  # Step 4: scores every phrase as the sum of its word scores. With
  # +position_bonus+ each word additionally contributes 1 / (n + 1), where n
  # counts the words already processed across all phrases.
  def generateCandidateKeywordScores(phrases, scores, position_bonus)
    word_index = 0

    phrases.each_with_object({}) do |phrase, candidates|
      score = 0

      seperateWords(phrase).each do |word|
        score += scores[word]
        next unless position_bonus

        score += 1.0 / (word_index + 1)
        word_index += 1
      end

      candidates[phrase] = score
    end
  end

  # Splits a phrase into lower-cased words, dropping empty and purely
  # numeric tokens.
  def seperateWords(text)
    text.split(WORD_SEPARATOR_REGEX)
        .map { |word| word.strip.downcase }
        .reject { |word| word.empty? || numeric?(word) }
  end

  # True when +word+ parses as a number.
  def numeric?(word)
    !Float(word, exception: false).nil?
  end

end
data/sig/keyphrase.rbs ADDED
@@ -0,0 +1,4 @@
1
# Type signatures for the keyphrase gem. Keyphrase is declared as a class to
# match its Ruby definition (`class Keyphrase`).
class Keyphrase
  VERSION: String
  # See the writing guide of rbs: https://github.com/ruby/rbs#guides
end
metadata ADDED
@@ -0,0 +1,55 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: keyphrase
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ben D'Angelo
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-12-03 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm
14
+ in Ruby. Forked from the original rake_text gem.
15
+ email:
16
+ - ben@bendangelo.me
17
+ executables: []
18
+ extensions: []
19
+ extra_rdoc_files: []
20
+ files:
21
+ - ".rspec"
22
+ - LICENSE.txt
23
+ - README.md
24
+ - Rakefile
25
+ - lib/keyphrase.rb
26
+ - lib/keyphrase/stoplist.rb
27
+ - lib/keyphrase/stoplist/eng.rb
28
+ - lib/keyphrase/version.rb
29
+ - sig/keyphrase.rbs
30
+ homepage: https://github.com/bendangelo/keyphrase
31
+ licenses:
32
+ - MIT
33
+ metadata:
34
+ homepage_uri: https://github.com/bendangelo/keyphrase
35
+ source_code_uri: https://github.com/bendangelo/keyphrase
36
+ post_install_message:
37
+ rdoc_options: []
38
+ require_paths:
39
+ - lib
40
+ required_ruby_version: !ruby/object:Gem::Requirement
41
+ requirements:
42
+ - - ">="
43
+ - !ruby/object:Gem::Version
44
+ version: 2.6.0
45
+ required_rubygems_version: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: '0'
50
+ requirements: []
51
+ rubygems_version: 3.4.22
52
+ signing_key:
53
+ specification_version: 4
54
+ summary: Extracts keywords from texts using a stoplist and some magic.
55
+ test_files: []