RubyGems - rake_text - Versions diffs - 0.0.1 - Mend

rake_text 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 637b694429c8c2ea3c0ddc660154286c1baabdb7
+  data.tar.gz: 46eb07a3601c15fafba6b1cf75942850ab473445
+SHA512:
+  metadata.gz: 6b32ed586c6eac22b35ab0b41a91888ed484342be3bf3123d896a2dc532d590873584f7e9d4a2ee41615da325b6aaf27019cd71b05656b67ed6d1a653c708fff
+  data.tar.gz: 30acfaffc47aca8b9eb12e248830eeac4711570cebad713cdd596a4b908d174836f8845aab4f7b746b6d59aebf4fe011368c61e4087a09686999f7a334ec1d1e

data/lib/rake_text.rb ADDED

@@ -0,0 +1,127 @@
# Rapid Automatic Keyword Extraction (RAKE) for plain text.
#
# The text is cut into fragments at punctuation, each fragment is cut into
# candidate phrases at stop words, every word is scored as
# (degree + frequency) / frequency over those phrases, and every phrase is
# scored as the sum of the scores of its words.
#
# Example:
#   RakeText.new.analyse "Compatibility of systems of linear constraints", RakeText.SMART
#   # => {"compatibility"=>1.0, "systems"=>1.0, "linear constraints"=>4.0}
class RakeText
	# Stop-word list returned by RakeText.SMART. Order is significant: the
	# stop-word pattern tries the entries from left to right.
	@@stoplist_smart = %w[
		a a's able about above according accordingly across actually after afterwards again against
		ain't all allow allows almost alone along already also although always am among amongst an and
		another any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate
		appropriate are aren't around as aside ask asking associated at available away awfully
		b be became because become becomes becoming been before beforehand behind being believe below
		beside besides best better between beyond both brief but by
		c c'mon c's came can can't cannot cant cause causes certain certainly changes clearly co com
		come comes concerning consequently consider considering contain containing contains
		corresponding could couldn't course currently
		d definitely described despite did didn't different do does doesn't doing don't done down
		downwards during
		e each edu eg eight either else elsewhere enough entirely especially et etc even ever every
		everybody everyone everything everywhere ex exactly example except
		f far few fifth first five followed following follows for former formerly forth four from
		further furthermore
		g get gets getting given gives go goes going gone got gotten greetings
		h had hadn't happens hardly has hasn't have haven't having he he's hello help hence her here
		here's hereafter hereby herein hereupon hers herself hi him himself his hither hopefully how
		howbeit however
		i i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed indicate indicated indicates
		inner insofar instead into inward is isn't it it'd it'll it's its itself
		j just
		k keep keeps kept know knows known
		l last lately later latter latterly least less lest let let's like liked likely little look
		looking looks ltd
		m mainly many may maybe me mean meanwhile merely might more moreover most mostly much must my
		myself
		n name namely nd near nearly necessary need needs neither never nevertheless new next nine no
		nobody non none noone nor normally not nothing novel now nowhere
		o obviously of off often oh ok okay old on once one ones only onto or other others otherwise
		ought our ours ourselves out outside over overall own
		p particular particularly per perhaps placed please plus possible presumably probably provides
		q que quite qv
		r rather rd re really reasonably regarding regardless regards relatively respectively right
		s said same saw say saying says second secondly see seeing seem seemed seeming seems seen self
		selves sensible sent serious seriously seven several shall she should shouldn't since six so
		some somebody somehow someone something sometime sometimes somewhat somewhere soon sorry
		specified specify specifying still sub such sup sure
		t t's take taken tell tends th than thank thanks thanx that that's thats the their theirs them
		themselves then thence there there's thereafter thereby therefore therein theres thereupon
		these they they'd they'll they're they've think third this thorough thoroughly those though
		three through throughout thru thus to together too took toward towards tried tries truly try
		trying twice two
		u un under unfortunately unless unlikely until unto up upon us use used useful uses using
		usually uucp
		v value various very via viz vs
		w want wants was wasn't way we we'd we'll we're we've welcome well went were weren't what
		what's whatever when whence whenever where where's whereafter whereas whereby wherein
		whereupon wherever whether which while whither who who's whoever whole whom whose why will
		willing wish with within without won't wonder would would wouldn't
		x y yes yet you you'd you'll you're you've your yours yourself yourselves z zero
	]

	# Stop-word list returned by RakeText.FOX.
	@@stoplist_fox = %w[
		a about above across after again against all almost alone along already also although always
		among an and another any anybody anyone anything anywhere are area areas around as ask asked
		asking asks at away
		b back backed backing backs be because became become becomes been before began behind being
		beings best better between big both but by
		c came can cannot case cases certain certainly clear clearly come could
		d did differ different differently do does done down downed downing downs during
		e each early either end ended ending ends enough even evenly ever every everybody everyone
		everything everywhere
		f face faces fact facts far felt few find finds first for four from full fully further
		furthered furthering furthers
		g gave general generally get gets give given gives go going good goods got great greater
		greatest group grouped grouping groups
		h had has have having he her herself here high higher highest him himself his how however
		i if important in interest interested interesting interests into is it its itself
		j just
		k keep keeps kind knew know known knows
		l large largely last later latest least less let lets like likely long longer longest
		m made make making man many may me member members men might more most mostly mr mrs much must
		my myself
		n necessary need needed needing needs never new newer newest next no non not nobody noone
		nothing now nowhere number numbered numbering numbers
		o of off often old older oldest on once one only open opened opening opens or order ordered
		ordering orders other others our out over
		p part parted parting parts per perhaps place places point pointed pointing points possible
		present presented presenting presents problem problems put puts
		q quite
		r rather really right room rooms
		s said same saw say says second seconds see seem seemed seeming seems sees several shall she
		should show showed showing shows side sides since small smaller smallest so some somebody
		someone something somewhere state states still such sure
		t take taken than that the their them then there therefore these they thing things think
		thinks this those though thought thoughts three through thus to today together too took
		toward turn turned turning turns two
		u under until up upon us use uses used
		v very
		w want wanted wanting wants was way ways we well wells went were what when where whether which
		while who whole whose why will with within without work worked working works would
		x y year years yet you young younger youngest your yours z
	]

	# Characters that end a text fragment: . ! ? , ; : tab, backslash, double
	# quote, parentheses, apostrophe, U+2019 and U+2013. The ASCII hyphen is
	# deliberately absent so that hyphenated words stay in one fragment.
	SENTENCE_DELIMITERS = /[.!?,;:\t\\"()'\u2019\u2013]/u

	# Anything that is not a letter, digit, underscore, plus or hyphen
	# separates two words inside a phrase.
	WORD_SEPARATORS = /[^a-zA-Z0-9_+\-]/

	# Returns the built-in SMART stop-word list (Array of String).
	def self.SMART
		@@stoplist_smart
	end

	# Returns the built-in FOX stop-word list (Array of String).
	def self.FOX
		@@stoplist_fox
	end

	# Extracts candidate key phrases from +text+ and scores them.
	#
	# text     - String to analyse.
	# stoplist - Array of stop words (for example RakeText.SMART); entries are
	#            matched literally, as whole words, ignoring case.
	# verbose  - when truthy, prints one "score - phrase" line per candidate to
	#            standard output, highest score first.
	#
	# Returns a Hash mapping each lower-cased phrase to its numeric score.
	def analyse(text, stoplist, verbose = false)
		pattern    = buildStopwordRegExPattern stoplist
		sentences  = text.split SENTENCE_DELIMITERS
		phrases    = generateCandidateKeywords sentences, pattern
		wordscores = calculateWordScores phrases
		candidates = generateCandidateKeywordScores phrases, wordscores
		if verbose
			candidates.sort_by { |_phrase, score| score }.reverse_each do |phrase, score|
				puts format('%.2f - %s', score, phrase)
			end
		end
		candidates
	end

	private

	# Step 1: build one case-insensitive regex that matches any stop word as a
	# whole word.
	#
	# Entries are regex-escaped so that they are matched literally, and blank
	# entries are skipped: an empty alternative would match at every position
	# and shred the text into single characters or single words.
	def buildStopwordRegExPattern(words)
		escaped = words.map(&:to_s).reject { |word| word.strip.empty? }.map { |word| Regexp.escape word }
		# (?!) can never match, so an empty stoplist leaves the text untouched.
		return Regexp.new('(?!)') if escaped.empty?
		Regexp.new "\\b(?:#{escaped.join('|')})\\b", Regexp::IGNORECASE
	end

	# Step 2: cut every fragment at its stop words and collect the non-empty,
	# stripped, lower-cased pieces as candidate phrases.
	#
	# Stop words are first replaced by "|" and the fragment is then split on
	# "|", so a literal pipe in the text also separates phrases.
	def generateCandidateKeywords(sentences, pattern)
		sentences.flat_map do |sentence|
			pieces = sentence.strip.gsub(pattern, "|").split("|")
			pieces.map { |piece| piece.strip.downcase }.reject(&:empty?)
		end
	end

	# Step 3: score every word as (degree + frequency) / frequency, where the
	# frequency counts the occurrences of the word and the degree sums, over
	# those occurrences, the number of other words in the same phrase.
	#
	# Returns a Hash (default 0) from word to Float score.
	def calculateWordScores(phrases)
		word_freq   = Hash.new 0
		word_degree = Hash.new 0
		phrases.each do |phrase|
			words = seperateWords phrase
			words.each do |word|
				word_freq[word]   += 1
				word_degree[word] += words.length - 1
			end
		end
		word_score = Hash.new 0
		word_freq.each do |word, freq|
			word_score[word] = (word_degree[word] + freq) / freq.to_f
		end
		word_score
	end

	# Step 4: score every phrase as the sum of the scores of its words.
	# A phrase without any countable word scores 0.
	#
	# Returns a Hash (default 0) from phrase to score.
	def generateCandidateKeywordScores(phrases, scores)
		candidates = Hash.new 0
		phrases.each do |phrase|
			candidates[phrase] = seperateWords(phrase).inject(0) { |sum, word| sum + scores[word] }
		end
		candidates
	end

	# Splits +text+ into lower-cased words, dropping empty tokens and tokens
	# that Kernel#Float accepts as a number.
	def seperateWords(text)
		text.split(WORD_SEPARATORS).map(&:downcase).reject do |word|
			word.empty? || numeric?(word)
		end
	end

	# True when Kernel#Float can parse +token+.
	def numeric?(token)
		Float(token)
		true
	rescue ArgumentError, TypeError
		false
	end
end

metadata ADDED

@@ -0,0 +1,45 @@
+--- !ruby/object:Gem::Specification
+name: rake_text
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+platform: ruby
+authors:
+- Darius Morawiec
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2013-12-23 00:00:00.000000000 Z
+dependencies: []
+description: Implementation of the Rapid Automatic Keyword Extraction (RAKE) algorithm
+  in Ruby, a multi-word keywords extraction.
+email: github@voidplus.de
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/rake_text.rb
+homepage: https://github.com/voidplus/rake-text-ruby
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.0.3
+signing_key:
+specification_version: 4
+summary: Rapid Automatic Keyword Extraction
+test_files: []