rake_text 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/rake_text.rb +127 -0
- metadata +45 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 637b694429c8c2ea3c0ddc660154286c1baabdb7
|
4
|
+
data.tar.gz: 46eb07a3601c15fafba6b1cf75942850ab473445
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 6b32ed586c6eac22b35ab0b41a91888ed484342be3bf3123d896a2dc532d590873584f7e9d4a2ee41615da325b6aaf27019cd71b05656b67ed6d1a653c708fff
|
7
|
+
data.tar.gz: 30acfaffc47aca8b9eb12e248830eeac4711570cebad713cdd596a4b908d174836f8845aab4f7b746b6d59aebf4fe011368c61e4087a09686999f7a334ec1d1e
|
data/lib/rake_text.rb
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
# Rapid Automatic Keyword Extraction (RAKE): extracts multi-word keyword
# candidates from a text and scores them by word degree / word frequency.
#
# Pipeline used by #analyse:
#   1. split the text into fragments at punctuation,
#   2. split each fragment into candidate phrases at stopwords,
#   3. score every word as (degree + frequency) / frequency, where degree is
#      the number of other words it shares a phrase with,
#   4. score every phrase as the sum of its word scores.
class RakeText
  # SMART stoplist. Content and order are kept exactly as shipped (including
  # the duplicated "would"). Left unfrozen on purpose so that callers which
  # extend the list in place keep working.
  STOPLIST_SMART = [
    "a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully",
    "b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by",
    "c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently",
    "d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during",
    "e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except",
    "f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore",
    "g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings",
    "h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however",
    "i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself",
    "j","just",
    "k","keep","keeps","kept","know","knows","known",
    "l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd",
    "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself",
    "n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere",
    "o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own",
    "p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides",
    "q","que","quite","qv",
    "r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right",
    "s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure",
    "t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two",
    "u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp",
    "v","value","various","very","via","viz","vs",
    "w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","would","wouldn't",
    "x",
    "y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves",
    "z","zero"
  ]

  # Fox stoplist. Content and order are kept exactly as shipped; left unfrozen
  # for the same compatibility reason as STOPLIST_SMART.
  STOPLIST_FOX = [
    "a","about","above","across","after","again","against","all","almost","alone","along","already","also","although","always","among","an","and","another","any","anybody","anyone","anything","anywhere","are","area","areas","around","as","ask","asked","asking","asks","at","away",
    "b","back","backed","backing","backs","be","because","became","become","becomes","been","before","began","behind","being","beings","best","better","between","big","both","but","by",
    "c","came","can","cannot","case","cases","certain","certainly","clear","clearly","come","could",
    "d","did","differ","different","differently","do","does","done","down","downed","downing","downs","during",
    "e","each","early","either","end","ended","ending","ends","enough","even","evenly","ever","every","everybody","everyone","everything","everywhere",
    "f","face","faces","fact","facts","far","felt","few","find","finds","first","for","four","from","full","fully","further","furthered","furthering","furthers",
    "g","gave","general","generally","get","gets","give","given","gives","go","going","good","goods","got","great","greater","greatest","group","grouped","grouping","groups",
    "h","had","has","have","having","he","her","herself","here","high","higher","highest","him","himself","his","how","however",
    "i","if","important","in","interest","interested","interesting","interests","into","is","it","its","itself",
    "j","just",
    "k","keep","keeps","kind","knew","know","known","knows",
    "l","large","largely","last","later","latest","least","less","let","lets","like","likely","long","longer","longest",
    "m","made","make","making","man","many","may","me","member","members","men","might","more","most","mostly","mr","mrs","much","must","my","myself",
    "n","necessary","need","needed","needing","needs","never","new","newer","newest","next","no","non","not","nobody","noone","nothing","now","nowhere","number","numbered","numbering","numbers",
    "o","of","off","often","old","older","oldest","on","once","one","only","open","opened","opening","opens","or","order","ordered","ordering","orders","other","others","our","out","over",
    "p","part","parted","parting","parts","per","perhaps","place","places","point","pointed","pointing","points","possible","present","presented","presenting","presents","problem","problems","put","puts",
    "q","quite",
    "r","rather","really","right","room","rooms",
    "s","said","same","saw","say","says","second","seconds","see","seem","seemed","seeming","seems","sees","several","shall","she","should","show","showed","showing","shows","side","sides","since","small","smaller","smallest","so","some","somebody","someone","something","somewhere","state","states","still","such","sure",
    "t","take","taken","than","that","the","their","them","then","there","therefore","these","they","thing","things","think","thinks","this","those","though","thought","thoughts","three","through","thus","to","today","together","too","took","toward","turn","turned","turning","turns","two",
    "u","under","until","up","upon","us","use","uses","used",
    "v","very",
    "w","want","wanted","wanting","wants","was","way","ways","we","well","wells","went","were","what","when","where","whether","which","while","who","whole","whose","why","will","with","within","without","work","worked","working","works","would",
    "x",
    "y","year","years","yet","you","young","younger","youngest","your","yours",
    "z"
  ]

  # Characters that end a text fragment: . ! ? , ; : TAB backslash " ( ) '
  # plus U+2019 (right single quote) and U+2013 (en dash). A plain hyphen is
  # NOT a delimiter, so hyphenated words stay together.
  SENTENCE_DELIMITERS = /[.!?,;:\t\\"()'\u2019\u2013]/u

  # Anything that is not a letter, digit, underscore, backslash, plus or
  # hyphen separates two words. The POSIX class is used instead of a-zA-Z so
  # that non-ASCII letters (e.g. accented characters) do not split a word.
  WORD_DELIMITERS = /[^[:alnum:]_\\+\-]/

  # Pattern that can never match; used when there are no stopwords to remove.
  NEVER_MATCH = /(?!)/

  # @return [Array<String>] the SMART stoplist
  def self.SMART
    STOPLIST_SMART
  end

  # @return [Array<String>] the Fox stoplist
  def self.FOX
    STOPLIST_FOX
  end

  # Extracts and scores keyword phrases.
  #
  # @param text [String, nil] text to analyse (nil is treated as empty)
  # @param stoplist [Array<String>, nil] stopwords that delimit phrases,
  #   matched case-insensitively as whole words
  # @param verbose [Boolean] when truthy, prints one "score - phrase" line
  #   per candidate to standard output, highest score first
  # @return [Hash{String => Numeric}] lower-cased phrase => score; the hash
  #   answers 0 for phrases it does not contain
  def analyse(text, stoplist, verbose = false)
    stopword_pattern = build_stopword_pattern(stoplist)
    fragments = text.to_s.split(SENTENCE_DELIMITERS)
    phrases = candidate_phrases(fragments, stopword_pattern)
    candidates = phrase_scores(phrases, word_scores(phrases))

    print_ranking(candidates) if verbose
    candidates
  end

  private

  # Step 1: one case-insensitive regex matching any stopword as a whole word.
  # Stopwords are escaped so regex metacharacters in custom lists are taken
  # literally; blank entries are ignored because an empty alternative would
  # match at every word boundary and shred the text.
  def build_stopword_pattern(words)
    escaped = Array(words).map { |word| Regexp.escape(word.to_s) }.reject(&:empty?)
    return NEVER_MATCH if escaped.empty?

    Regexp.new("\\b(?:#{escaped.join('|')})\\b", Regexp::IGNORECASE)
  end

  # Step 2: cuts every fragment at its stopwords and returns the remaining
  # non-empty pieces, stripped and lower-cased, in order of appearance.
  # "|" is the cut marker, so a literal "|" in the text also ends a phrase.
  def candidate_phrases(fragments, stopword_pattern)
    fragments.flat_map do |fragment|
      fragment.strip.gsub(stopword_pattern, '|').split('|')
              .map { |piece| piece.strip.downcase }
              .reject(&:empty?)
    end
  end

  # Step 3: per-word score = (degree + frequency) / frequency, where a word's
  # degree grows by (phrase length - 1) for every phrase it appears in.
  def word_scores(phrases)
    frequency = Hash.new(0)
    degree = Hash.new(0)

    phrases.each do |phrase|
      words = separate_words(phrase)
      neighbours = words.length - 1
      words.each do |word|
        frequency[word] += 1
        degree[word] += neighbours
      end
    end

    scores = Hash.new(0)
    frequency.each do |word, count|
      scores[word] = (degree[word] + count) / count.to_f
    end
    scores
  end

  # Step 4: phrase score = sum of the scores of its words. A phrase repeated
  # in the input simply overwrites its own (identical) entry.
  def phrase_scores(phrases, scores)
    phrases.each_with_object(Hash.new(0)) do |phrase, candidates|
      candidates[phrase] = separate_words(phrase).inject(0) { |sum, word| sum + scores[word] }
    end
  end

  # Splits a phrase into lower-cased words, dropping empty tokens and tokens
  # that parse as numbers.
  def separate_words(text)
    text.split(WORD_DELIMITERS)
        .map { |token| token.strip.downcase }
        .reject { |token| token.empty? || numeric?(token) }
  end

  # True when Kernel#Float accepts the token (e.g. "42", "3e5", "0x1f").
  def numeric?(token)
    Float(token)
    true
  rescue ArgumentError, TypeError
    false
  end

  # Prints the candidates as "score - phrase", highest score first.
  def print_ranking(candidates)
    candidates.sort_by { |_phrase, score| score }.reverse_each do |phrase, score|
      puts format('%.2f - %s', score, phrase)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rake_text
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Darius Morawiec
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-12-23 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm
|
14
|
+
in Ruby, a multi-word keywords extraction.
|
15
|
+
email: github@voidplus.de
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- lib/rake_text.rb
|
21
|
+
homepage: https://github.com/voidplus/rake-text-ruby
|
22
|
+
licenses:
|
23
|
+
- MIT
|
24
|
+
metadata: {}
|
25
|
+
post_install_message:
|
26
|
+
rdoc_options: []
|
27
|
+
require_paths:
|
28
|
+
- lib
|
29
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: '0'
|
39
|
+
requirements: []
|
40
|
+
rubyforge_project:
|
41
|
+
rubygems_version: 2.0.3
|
42
|
+
signing_key:
|
43
|
+
specification_version: 4
|
44
|
+
summary: Rapid Automatic Keyword Extraction
|
45
|
+
test_files: []
|