WebWordSorter 0.1.2
- checksums.yaml +7 -0
- data/lib/WebWordSorter.rb +249 -0
- data/lib/examples/example.rb +49 -0
- data/lib/resources/words.txt +235887 -0
- data/lib/test/WWS_test_cases.rb +145 -0
- metadata +137 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: bbb277b659ed1e9907e2223f3384bed97240aa4e
  data.tar.gz: 253a40ff16ee7810edb4f61f2765fdcc7577685b
SHA512:
  metadata.gz: e2dd2461f42c708a90540bc0e02b615c59f27901717e1b8954e0e6807aaef0122a89f5a29eb7c13df25b871192e4d84b3e7ce1fee1ba896354f8ab187e83a1e5
  data.tar.gz: b258421a958485c0079628a5fc6b8f0cee29ba25bab65a78d2022a4ea4f46b28e864f7cf711113055b6646ce6fba40c5fb7f4870c09053bb83ab39ecac9f585d
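These digests let a consumer verify the two archives packed inside the .gem file. As a minimal sketch (assuming metadata.gz and data.tar.gz have already been extracted from the gem into the current directory), the values above could be recomputed with Ruby's standard digest library:

    require 'digest'

    # Recompute both digests for each packed archive and compare them
    # against the values recorded in checksums.yaml.
    %w[metadata.gz data.tar.gz].each do |name|
      data = File.binread(name)
      puts "#{name} SHA1:   #{Digest::SHA1.hexdigest(data)}"
      puts "#{name} SHA512: #{Digest::SHA512.hexdigest(data)}"
    end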
data/lib/WebWordSorter.rb
ADDED
@@ -0,0 +1,249 @@
require 'nokogiri'
require 'anemone'
require 'open-uri'
require 'faraday'
require 'pdf/reader'
require 'rubygems'
require 'ruby-progressbar'

# Author:: John Z. Abrams (mailto:jzabrams@unm.edu)
# Copyright:: Copyright (c) 2014 John Z. Abrams
# License:: Distributed under MIT License

##
# == Description
# This class contains all the necessary methods to do the following:
# - Crawl most websites and return an array of their URLs.
# - Convert all HTML and most web-linked PDF documents to one large string, given an array of URLs.
# - Parse out all non-words and non-human-sensible markup.
# - Stooge sort an array of words via iteration, <b>NOT recursion. NOTE: The current version
#   IS using recursive stooge sort!</b>
# - Write the array of words to a file.
#
# When used properly, this class will take all unique words on a website, parse them, sort them, and write them to a .txt file.
#
class WebWordSorter
  ##
  # === Description
  # Use the anemone gem to crawl each page of a given site and store each URL in
  # an array, which is returned.
  #
  # ==== Attribute
  # * +url_base+ - Base URL for the crawler to start from
  # ==== Returns
  # * +page_urls+ - An array that contains all of the linked URLs for the given domain
  # ==== Example
  #  base = WebWordSorter.new
  #  base.crawler("http://www.example.com")
  def crawler(url_base)
    pg_crawl = ProgressBar.create(:title => "Crawling: #{url_base}", :starting_at => 20, :total => nil, :format => '%a %t')

    page_urls = Array.new

    Anemone.crawl("#{url_base}") do |anemone|
      anemone.on_every_page do |page|
        page_urls << page.url

        pg_crawl.title = page.url
        pg_crawl.increment
      end
    end
    pg_crawl.total = 100
    pg_crawl.finish
    return page_urls
  end
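
  # Illustrative sketch (an editor's addition, not part of this class):
  # Anemone also accepts crawl options, e.g. a depth limit to keep very
  # large sites manageable:
  #  Anemone.crawl("http://www.example.com", :depth_limit => 2) do |anemone|
  #    anemone.on_every_page { |page| puts page.url }
  #  end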

  ##
  # === Description
  # Takes an array of URLs and converts each one to a string. All of these strings are
  # added to one larger string that is returned. The Faraday gem is used to check that
  # each link is still alive before trying to convert it. PDF files are converted to
  # strings using the pdf-reader gem. Nokogiri is used here as well to remove
  # most HTML markup before adding each page's string to the main larger string.
  #
  # ==== Attribute
  # * +url_array+ - Array of URLs to convert to a string
  # ==== Returns
  # * +pages_string+ - A partially sanitized string containing all human-relevant contents of each given URL
  #
  # ==== Example
  #  base = WebWordSorter.new
  #  urls = ["http://www.test1.com","http://www.test2.com","http://www.test3.com"]
  #  base.pages_to_string(urls)
  def pages_to_string(url_array)
    pages_string = String.new

    url_array.each do |address|
      if Faraday.head(address).status == 200
        if "#{address}"[-4, 4] == ".pdf"
          pdf_file = open(address)
          pdf_reader = PDF::Reader.new(pdf_file)

          pdf_reader.pages.each do |page|
            pages_string = pages_string + page.text
          end
        else
          data = Nokogiri::HTML(open(address))

          # take advantage of the Nokogiri script removal
          # BEFORE converting to a string.
          data.css('script').remove

          pages_string = pages_string + data.text
        end
      end
    end

    return pages_string
  end
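
  # Illustrative sketch (an editor's addition, not part of this class):
  # Faraday.head raises (e.g. Faraday::ConnectionFailed) for unresolvable
  # hosts rather than returning a status, so a caller may want a guard such as:
  #  def alive?(address)
  #    Faraday.head(address).status == 200
  #  rescue Faraday::Error
  #    false
  #  end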

  ##
  # === Description
  # Take in a string, remove anything that isn't a letter or a single space,
  # and return the resulting string.
  #
  # ==== Attribute
  # * +pre_parse_string+ - String to be parsed
  # ==== Returns
  # * +parsed_string+ - String containing only letters and spaces
  # ==== Example
  #  base = WebWordSorter.new
  #  base.parse_string("hello this is a test 1234 !@#$")
  #
  # will return: "hello this is a test"
  def parse_string(pre_parse_string)
    parsed_string = pre_parse_string.gsub(/-+/, '')
    parsed_string = parsed_string.gsub(/[^a-zA-Z ]+/, ' ')
    parsed_string = parsed_string.squeeze(' ')

    return parsed_string
  end
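
  # Worked illustration (an editor's addition) of the two-step cleanup above:
  # the gsub turns each run of non-letters into a single space and squeeze
  # collapses the leftover runs of spaces:
  #  "hello this is a test 1234 !@#$".gsub(/[^a-zA-Z ]+/, ' ').squeeze(' ')
  #  # => "hello this is a test " (trailing space; split discards it later)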

  ##
  # === Description
  # Take in a string, split it at each space into an array, and
  # remove any non-unique entries.
  #
  # ==== Attribute
  # * +pre_array_string+ - String to be split and have all entries made unique
  # ==== Returns
  # * +words_array+ - Array of unique, letter-only entries
  # ==== Example
  #  base = WebWordSorter.new
  #  base.split_uniq some_string
  def split_uniq(pre_array_string)
    words_array = pre_array_string.split

    words_array = words_array.uniq

    words_array.map! { |x| x.strip }

    return words_array
  end

  ##
  # === Description
  # Take in an array and remove any entry that doesn't match an entry in a
  # predetermined dictionary txt file. This ensures that only valid words
  # make it out.
  #
  # ==== Attribute
  # * +words+ - Array of strings
  # ==== Returns
  # * +words+ - Array of valid words, as determined by comparison to a dictionary txt file
  # ==== Example
  #  base = WebWordSorter.new
  #  base.spell_check some_array
  def spell_check(words)
    dict_path = File.join(File.dirname(__FILE__), '/resources/words.txt')

    dictionary = IO.readlines(dict_path).map { |line| line.strip }

    words.select { |word| dictionary.include?(word) }
  end
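
  # Illustrative sketch (an editor's addition, not part of this class):
  # dictionary.include? scans the whole word list for every word. Loading the
  # dictionary into a Set gives constant-time lookups with identical results:
  #  require 'set'
  #  dictionary = Set.new(IO.readlines(dict_path).map(&:strip))
  #  words.select { |word| dictionary.include?(word) }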

  ##
  # === Description
  # Take in an array and sort its entries short to long using iterative stooge sort.
  # === NOTE: Stooge sort is currently implemented using recursion, not iteration.
  # I am still working on making it iterative.
  #
  # ==== Attribute
  # * +input_array+ - Array to be sorted
  # ==== Returns
  # * +sorted_array+ - Array of strings sorted from shortest to longest
  # ==== Example
  #  base = WebWordSorter.new
  #  base.stooge_sort some_array
  def stooge_sort(input_array)
    sorted_array = input_array.stoogesort

    return sorted_array
  end

  ##
  # === Description
  # Take in an array and save its contents to a file. The file is created by the
  # method; an existing file with the same name will be overwritten.
  #
  # ==== Attributes
  # * +file_name+ - Name of the file to be created and written to
  # * +array_to_save+ - Array to be saved to the txt file
  # ==== Example
  #  base = WebWordSorter.new
  #  base.save_array_to_file("words", some_array)
  def save_array_to_file(file_name, array_to_save)
    File.open("#{file_name}.txt", "w") do |file|
      array_to_save.each { |word| file.puts(word) }
    end
  end
end


class Array
  def stoogesort
    self.dup.stoogesort!
  end

  def stoogesort!(i = 0, j = self.length - 1)
    if self[j].length < self[i].length
      self[i], self[j] = self[j], self[i]
    end
    if j - i > 1
      t = (j - i + 1) / 3
      stoogesort!(i, j - t)
      stoogesort!(i + t, j)
      stoogesort!(i, j - t)
    end
    self
  end
end
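
Stooge sort runs in roughly O(n^2.7) time, so it is a novelty rather than a practical sort. A quick illustration (an editor's addition, traced by hand against the algorithm) of the length-based ordering the monkey patch above produces:

    words = ["banana", "fig", "pear"]
    words.stoogesort  # => ["fig", "pear", "banana"]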

data/lib/examples/example.rb
ADDED
@@ -0,0 +1,49 @@
require 'WebWordSorter'
require 'colorize'

# Author:: John Z. Abrams (mailto:jzabrams@unm.edu)
# Copyright:: Copyright (c) 2014 John Z. Abrams
# License:: Distributed under MIT License


#
# == Description
# This is an example of the use of the WebWordSorter gem.
# This program will take all of the unique words from
# a user-given website, sort them, and save them to a
# text file with a user-given name/location.
#

puts "Enter the URL for the website you want to crawl (enter the full URL, including http:// or https://, and any required domain prefix):".blue
url = gets.chomp

puts "Enter a name for the output file. It will be saved in the directory where you run this program.".blue
file_name = gets.chomp

puts "Crawling #{url} for all links. This may take a while...".red
wws = WebWordSorter.new
urls = wws.crawler url
puts "Crawling complete.".green

puts "Converting pages to words.".red
pages_string = wws.pages_to_string urls
parsed_string = wws.parse_string pages_string
words_array = wws.split_uniq parsed_string
puts "Page conversion complete.".green

puts "Spell checking words.".red
array = wws.spell_check(words_array)
puts "Spell checking complete.".green

puts "Sorting words. Please wait...".red
sorted_array = wws.stooge_sort array
puts "Sorting complete.".green

puts "Saving file #{file_name}.txt".red
wws.save_array_to_file(file_name, sorted_array)
puts "DONE!".green
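
For a non-interactive variant of the pipeline above, the same calls can be chained directly (a sketch with a hypothetical URL and output name, assuming the gem and its dependencies are installed):

    require 'WebWordSorter'

    wws = WebWordSorter.new
    urls = wws.crawler("http://www.example.com")
    words = wws.split_uniq(wws.parse_string(wws.pages_to_string(urls)))
    wws.save_array_to_file("example_words", wws.stooge_sort(wws.spell_check(words)))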