rake_text 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/rake_text.rb +127 -0
  3. metadata +45 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 637b694429c8c2ea3c0ddc660154286c1baabdb7
4
+ data.tar.gz: 46eb07a3601c15fafba6b1cf75942850ab473445
5
+ SHA512:
6
+ metadata.gz: 6b32ed586c6eac22b35ab0b41a91888ed484342be3bf3123d896a2dc532d590873584f7e9d4a2ee41615da325b6aaf27019cd71b05656b67ed6d1a653c708fff
7
+ data.tar.gz: 30acfaffc47aca8b9eb12e248830eeac4711570cebad713cdd596a4b908d174836f8845aab4f7b746b6d59aebf4fe011368c61e4087a09686999f7a334ec1d1e
@@ -0,0 +1,127 @@
1
# Keyword extraction in the style of RAKE (Rapid Automatic Keyword
# Extraction): text is cut into candidate phrases at punctuation and stop
# words, every word gets a degree/frequency score, and each phrase is scored
# with the sum of its word scores.
#
#   RakeText.new.analyse("deep learning and learning", RakeText.FOX)
#   # => {"deep learning"=>3.5, "learning"=>1.5}
class RakeText
  # Stop words returned by RakeText.SMART. Order is significant only in that
  # earlier entries win when two entries could match at the same position.
  STOPLIST_SMART = %w[
    a a's able about above according accordingly across actually after afterwards again against
    ain't all allow allows almost alone along already also although always am among amongst an and
    another any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate
    appropriate are aren't around as aside ask asking associated at available away awfully b be
    became because become becomes becoming been before beforehand behind being believe below beside
    besides best better between beyond both brief but by c c'mon c's came can can't cannot cant
    cause causes certain certainly changes clearly co com come comes concerning consequently
    consider considering contain containing contains corresponding could couldn't course currently
    d definitely described despite did didn't different do does doesn't doing don't done down
    downwards during e each edu eg eight either else elsewhere enough entirely especially et etc
    even ever every everybody everyone everything everywhere ex exactly example except f far few
    fifth first five followed following follows for former formerly forth four from further
    furthermore g get gets getting given gives go goes going gone got gotten greetings h had hadn't
    happens hardly has hasn't have haven't having he he's hello help hence her here here's
    hereafter hereby herein hereupon hers herself hi him himself his hither hopefully how howbeit
    however i i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed indicate indicated
    indicates inner insofar instead into inward is isn't it it'd it'll it's its itself j just k
    keep keeps kept know knows known l last lately later latter latterly least less lest let let's
    like liked likely little look looking looks ltd m mainly many may maybe me mean meanwhile
    merely might more moreover most mostly much must my myself n name namely nd near nearly
    necessary need needs neither never nevertheless new next nine no nobody non none noone nor
    normally not nothing novel now nowhere o obviously of off often oh ok okay old on once one ones
    only onto or other others otherwise ought our ours ourselves out outside over overall own p
    particular particularly per perhaps placed please plus possible presumably probably provides q
    que quite qv r rather rd re really reasonably regarding regardless regards relatively
    respectively right s said same saw say saying says second secondly see seeing seem seemed
    seeming seems seen self selves sensible sent serious seriously seven several shall she should
    shouldn't since six so some somebody somehow someone something sometime sometimes somewhat
    somewhere soon sorry specified specify specifying still sub such sup sure t t's take taken tell
    tends th than thank thanks thanx that that's thats the their theirs them themselves then thence
    there there's thereafter thereby therefore therein theres thereupon these they they'd they'll
    they're they've think third this thorough thoroughly those though three through throughout
    thru thus to together too took toward towards tried tries truly try trying twice two u un under
    unfortunately unless unlikely until unto up upon us use used useful uses using usually uucp v
    value various very via viz vs w want wants was wasn't way we we'd we'll we're we've welcome
    well went were weren't what what's whatever when whence whenever where where's whereafter
    whereas whereby wherein whereupon wherever whether which while whither who who's whoever whole
    whom whose why will willing wish with within without won't wonder would wouldn't x y yes yet
    you you'd you'll you're you've your yours yourself yourselves z zero
  ].freeze

  # Stop words returned by RakeText.FOX.
  STOPLIST_FOX = %w[
    a about above across after again against all almost alone along already also although always
    among an and another any anybody anyone anything anywhere are area areas around as ask asked
    asking asks at away b back backed backing backs be because became become becomes been before
    began behind being beings best better between big both but by c came can cannot case cases
    certain certainly clear clearly come could d did differ different differently do does done down
    downed downing downs during e each early either end ended ending ends enough even evenly ever
    every everybody everyone everything everywhere f face faces fact facts far felt few find finds
    first for four from full fully further furthered furthering furthers g gave general generally
    get gets give given gives go going good goods got great greater greatest group grouped grouping
    groups h had has have having he her herself here high higher highest him himself his how
    however i if important in interest interested interesting interests into is it its itself j
    just k keep keeps kind knew know known knows l large largely last later latest least less let
    lets like likely long longer longest m made make making man many may me member members men
    might more most mostly mr mrs much must my myself n necessary need needed needing needs never
    new newer newest next no non not nobody noone nothing now nowhere number numbered numbering
    numbers o of off often old older oldest on once one only open opened opening opens or order
    ordered ordering orders other others our out over p part parted parting parts per perhaps place
    places point pointed pointing points possible present presented presenting presents problem
    problems put puts q quite r rather really right room rooms s said same saw say says second
    seconds see seem seemed seeming seems sees several shall she should show showed showing shows
    side sides since small smaller smallest so some somebody someone something somewhere state
    states still such sure t take taken than that the their them then there therefore these they
    thing things think thinks this those though thought thoughts three through thus to today
    together too took toward turn turned turning turns two u under until up upon us use uses used v
    very w want wanted wanting wants was way ways we well wells went were what when where whether
    which while who whole whose why will with within without work worked working works would x y
    year years yet you young younger youngest your yours z
  ].freeze

  # Characters that end a "sentence": . ! ? , ; : TAB \ " ( ) ' U+2019 U+2013.
  # This is the same effective set as before, written without the doubled
  # escapes that made the character class hard to read.
  # NOTE(review): '-' is NOT a delimiter. The earlier pattern text contained
  # `\\-\\`, which inside a regex literal is a backslash-to-backslash range
  # rather than an escaped hyphen. Confirm whether splitting on hyphens was
  # intended before changing this.
  SENTENCE_DELIMITERS = /[.!?,;:\t\\"()'\u2019\u2013]/u

  # Everything that is not part of a word. Letters, digits, '_', '\', '+' and
  # '-' are word characters; anything else (including non-ASCII letters)
  # separates words.
  WORD_SEPARATOR = /[^a-zA-Z0-9_\\+\-]/

  # Pattern that can never match; used when there are no stop words.
  NEVER_MATCHES = /(?!)/

  # @return [Array<String>] a fresh copy of the SMART stop word list, so a
  #   caller may modify it without affecting later calls.
  def self.SMART
    STOPLIST_SMART.dup
  end

  # @return [Array<String>] a fresh copy of the FOX stop word list, so a
  #   caller may modify it without affecting later calls.
  def self.FOX
    STOPLIST_FOX.dup
  end

  # Extracts candidate key phrases from +text+ and scores them.
  #
  # @param text [String, nil] the text to analyse; nil is treated as empty
  # @param stoplist [Array<String>] stop words that separate phrases, matched
  #   literally, case-insensitively and only as whole words
  # @param verbose [Boolean] when truthy, prints "score - phrase" lines to
  #   standard output, highest score first
  # @return [Hash{String => Numeric}] lower-cased phrase => score; unknown
  #   keys read as 0
  def analyse(text, stoplist, verbose = false)
    pattern = build_stopword_pattern(stoplist)
    sentences = text.to_s.split(SENTENCE_DELIMITERS)
    phrases = candidate_phrases(sentences, pattern)
    candidates = phrase_scores(phrases, word_scores(phrases))

    print_ranking(candidates) if verbose
    candidates
  end

  private

  # Builds one case-insensitive regex matching any stop word as a whole word.
  # Words are escaped so regex metacharacters in a custom list are literal.
  # Blank entries are dropped, and an empty list yields a pattern that never
  # matches: an empty regex would otherwise match between every character.
  def build_stopword_pattern(words)
    alternatives = Array(words).map(&:to_s).reject(&:empty?).map { |word| Regexp.escape(word) }
    return NEVER_MATCHES if alternatives.empty?

    Regexp.new("\\b(?:#{alternatives.join('|')})\\b", Regexp::IGNORECASE)
  end

  # Cuts every sentence at its stop words and returns the non-empty,
  # stripped, lower-cased pieces. Stop words are replaced by '|' before
  # splitting, so a literal '|' in the text also ends a phrase.
  def candidate_phrases(sentences, pattern)
    sentences.flat_map do |sentence|
      sentence.gsub(pattern, '|').split('|').map { |part| part.strip.downcase }.reject(&:empty?)
    end
  end

  # Scores each word as degree / frequency. Every occurrence of a word adds
  # the word count of its phrase to the word's degree (its co-occurring words
  # plus itself).
  def word_scores(phrases)
    frequency = Hash.new(0)
    degree = Hash.new(0)

    phrases.each do |phrase|
      words = separate_words(phrase)
      words.each do |word|
        frequency[word] += 1
        degree[word] += words.length
      end
    end

    frequency.each_with_object(Hash.new(0)) do |(word, count), scores|
      scores[word] = degree[word].fdiv(count)
    end
  end

  # Scores each distinct phrase as the sum of its word scores. A phrase with
  # no scorable words (for example only numbers) gets 0.
  def phrase_scores(phrases, scores)
    phrases.each_with_object(Hash.new(0)) do |phrase, candidates|
      candidates[phrase] = separate_words(phrase).inject(0) { |total, word| total + scores[word] }
    end
  end

  # Splits +text+ into lower-cased words, dropping empty pieces and anything
  # that Kernel#Float accepts as a number.
  def separate_words(text)
    text.split(WORD_SEPARATOR).map(&:downcase).reject { |word| word.empty? || numeric?(word) }
  end

  # True when Kernel#Float can parse +word+.
  def numeric?(word)
    Float(word)
    true
  rescue ArgumentError
    false
  end

  # Prints the candidates to standard output, best score first.
  def print_ranking(candidates)
    candidates.sort_by { |_phrase, score| score }.reverse_each do |phrase, score|
      puts format('%.2f - %s', score, phrase)
    end
  end
end
metadata ADDED
@@ -0,0 +1,45 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rake_text
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Darius Morawiec
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-12-23 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm
14
+ in Ruby, a multi-word keywords extraction.
15
+ email: github@voidplus.de
16
+ executables: []
17
+ extensions: []
18
+ extra_rdoc_files: []
19
+ files:
20
+ - lib/rake_text.rb
21
+ homepage: https://github.com/voidplus/rake-text-ruby
22
+ licenses:
23
+ - MIT
24
+ metadata: {}
25
+ post_install_message:
26
+ rdoc_options: []
27
+ require_paths:
28
+ - lib
29
+ required_ruby_version: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ required_rubygems_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - '>='
37
+ - !ruby/object:Gem::Version
38
+ version: '0'
39
+ requirements: []
40
+ rubyforge_project:
41
+ rubygems_version: 2.0.3
42
+ signing_key:
43
+ specification_version: 4
44
+ summary: Rapid Automatic Keyword Extraction
45
+ test_files: []