WebWordSorter 0.1.2
- checksums.yaml +7 -0
- data/lib/WebWordSorter.rb +249 -0
- data/lib/examples/example.rb +49 -0
- data/lib/resources/words.txt +235887 -0
- data/lib/test/WWS_test_cases.rb +145 -0
- metadata +137 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: bbb277b659ed1e9907e2223f3384bed97240aa4e
  data.tar.gz: 253a40ff16ee7810edb4f61f2765fdcc7577685b
SHA512:
  metadata.gz: e2dd2461f42c708a90540bc0e02b615c59f27901717e1b8954e0e6807aaef0122a89f5a29eb7c13df25b871192e4d84b3e7ce1fee1ba896354f8ab187e83a1e5
  data.tar.gz: b258421a958485c0079628a5fc6b8f0cee29ba25bab65a78d2022a4ea4f46b28e864f7cf711113055b6646ce6fba40c5fb7f4870c09053bb83ab39ecac9f585d
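These digests let a consumer verify the two archives packed inside the .gem file. As a minimal sketch (assuming metadata.gz and data.tar.gz have already been extracted from the gem into the current directory), the values above could be recomputed with Ruby's standard digest library:

    require 'digest'

    # Recompute both digests for each packed archive and compare them
    # against the values recorded in checksums.yaml.
    %w[metadata.gz data.tar.gz].each do |name|
      data = File.binread(name)
      puts "#{name} SHA1:   #{Digest::SHA1.hexdigest(data)}"
      puts "#{name} SHA512: #{Digest::SHA512.hexdigest(data)}"
    end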
data/lib/WebWordSorter.rb
ADDED
@@ -0,0 +1,249 @@
require 'nokogiri'
require 'anemone'
require 'open-uri'
require 'faraday'
require 'pdf/reader'
require 'rubygems'
require 'ruby-progressbar'

# Author:: John Z. Abrams (mailto:jzabrams@unm.edu)
# Copyright:: Copyright (c) 2014 John Z. Abrams
# License:: Distributed under MIT License

##
# == Description
# This class contains all the necessary methods to do the following:
# - Crawl most websites and return an array of their URLs.
# - Convert all HTML and most web-linked PDF documents to one large string, given an array of URLs.
# - Parse out all non-words and non-human-sensible markup.
# - Stooge sort an array of words via iteration, <b>NOT recursion. NOTE: The current version
#   IS using recursive stooge sort!</b>
# - Write the array of words to a file.
#
# When used properly, this class will take all unique words on a website, parse them, sort them, and write them to a .txt file.
#
class WebWordSorter
  ##
  # === Description
  # Use the anemone gem to crawl each page of a given site and store each URL in
  # an array, which is returned.
  #
  # ==== Attribute
  # * +url_base+ - Base URL for the crawler to start from
  # ==== Returns
  # * +page_urls+ - An array that contains all of the linked URLs for the given domain
  # ==== Example
  #  base = WebWordSorter.new
  #  base.crawler("http://www.example.com")
  def crawler(url_base)
    pg_crawl = ProgressBar.create(:title => "Crawling: #{url_base}", :starting_at => 20, :total => nil, :format => '%a %t')

    page_urls = Array.new

    Anemone.crawl("#{url_base}") do |anemone|
      anemone.on_every_page do |page|
        page_urls << page.url

        pg_crawl.title = page.url
        pg_crawl.increment
      end
    end
    pg_crawl.total = 100
    pg_crawl.finish
    return page_urls
  end
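
  # Illustrative sketch (an editor's addition, not part of this class):
  # Anemone also accepts crawl options, e.g. a depth limit to keep very
  # large sites manageable:
  #  Anemone.crawl("http://www.example.com", :depth_limit => 2) do |anemone|
  #    anemone.on_every_page { |page| puts page.url }
  #  end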

  ##
  # === Description
  # Takes an array of URLs and converts each one to a string. All of these strings are
  # added to one larger string that is returned. The Faraday gem is used to check that
  # each link is still alive before trying to convert it. PDF files are converted to
  # strings using the pdf-reader gem. Nokogiri is used here as well to remove
  # most HTML markup before adding each page's string to the main larger string.
  #
  # ==== Attribute
  # * +url_array+ - Array of URLs to convert to a string
  # ==== Returns
  # * +pages_string+ - A partially sanitized string containing all human-relevant contents of each given URL
  #
  # ==== Example
  #  base = WebWordSorter.new
  #  urls = ["http://www.test1.com","http://www.test2.com","http://www.test3.com"]
  #  base.pages_to_string(urls)
  def pages_to_string(url_array)
    pages_string = String.new

    url_array.each do |address|
      if Faraday.head(address).status == 200
        if "#{address}"[-4, 4] == ".pdf"
          pdf_file = open(address)
          pdf_reader = PDF::Reader.new(pdf_file)

          pdf_reader.pages.each do |page|
            pages_string = pages_string + page.text
          end
        else
          data = Nokogiri::HTML(open(address))

          # take advantage of the Nokogiri script removal
          # BEFORE converting to a string.
          data.css('script').remove

          pages_string = pages_string + data.text
        end
      end
    end

    return pages_string
  end
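
  # Illustrative sketch (an editor's addition, not part of this class):
  # Faraday.head raises (e.g. Faraday::ConnectionFailed) for unresolvable
  # hosts rather than returning a status, so a caller may want a guard such as:
  #  def alive?(address)
  #    Faraday.head(address).status == 200
  #  rescue Faraday::Error
  #    false
  #  end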

  ##
  # === Description
  # Take in a string, remove anything that isn't a letter or a single space,
  # and return the resulting string.
  #
  # ==== Attribute
  # * +pre_parse_string+ - String to be parsed
  # ==== Returns
  # * +parsed_string+ - String containing only letters and spaces
  # ==== Example
  #  base = WebWordSorter.new
  #  base.parse_string("hello this is a test 1234 !@#$")
  #
  # will return: "hello this is a test"
  def parse_string(pre_parse_string)
    parsed_string = pre_parse_string.gsub(/-+/, '')
    parsed_string = parsed_string.gsub(/[^a-zA-Z ]+/, ' ')
    parsed_string = parsed_string.squeeze(' ')

    return parsed_string
  end
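
  # Worked illustration (an editor's addition) of the two-step cleanup above:
  # the gsub turns each run of non-letters into a single space and squeeze
  # collapses the leftover runs of spaces:
  #  "hello this is a test 1234 !@#$".gsub(/[^a-zA-Z ]+/, ' ').squeeze(' ')
  #  # => "hello this is a test " (trailing space; split discards it later)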

  ##
  # === Description
  # Take in a string, split it at each space into an array, and
  # remove any non-unique entries.
  #
  # ==== Attribute
  # * +pre_array_string+ - String to be split and have all entries made unique
  # ==== Returns
  # * +words_array+ - Array of unique, letter-only entries
  # ==== Example
  #  base = WebWordSorter.new
  #  base.split_uniq some_string
  def split_uniq(pre_array_string)
    words_array = pre_array_string.split

    words_array = words_array.uniq

    words_array.map! { |x| x.strip }

    return words_array
  end

  ##
  # === Description
  # Take in an array and remove any entry that doesn't match an entry in a
  # predetermined dictionary txt file. This ensures that only valid words
  # make it out.
  #
  # ==== Attribute
  # * +words+ - Array of strings
  # ==== Returns
  # * +words+ - Array of valid words, as determined by comparison to a dictionary txt file
  # ==== Example
  #  base = WebWordSorter.new
  #  base.spell_check some_array
  def spell_check(words)
    dict_path = File.join(File.dirname(__FILE__), '/resources/words.txt')

    dictionary = IO.readlines(dict_path).map { |line| line.strip }

    words.select { |word| dictionary.include?(word) }
  end
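
  # Illustrative sketch (an editor's addition, not part of this class):
  # dictionary.include? scans the whole word list for every word. Loading the
  # dictionary into a Set gives constant-time lookups with identical results:
  #  require 'set'
  #  dictionary = Set.new(IO.readlines(dict_path).map(&:strip))
  #  words.select { |word| dictionary.include?(word) }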

  ##
  # === Description
  # Take in an array and sort its entries short to long using iterative stooge sort.
  # === NOTE: Stooge sort is currently implemented using recursion, not iteration.
  # I am still working on making it iterative.
  #
  # ==== Attribute
  # * +input_array+ - Array to be sorted
  # ==== Returns
  # * +sorted_array+ - Array of strings sorted from shortest to longest
  # ==== Example
  #  base = WebWordSorter.new
  #  base.stooge_sort some_array
  def stooge_sort(input_array)
    sorted_array = input_array.stoogesort

    return sorted_array
  end

  ##
  # === Description
  # Take in an array and save its contents to a file. The file is created by the
  # method; an existing file with the same name will be overwritten.
  #
  # ==== Attributes
  # * +file_name+ - Name of the file to be created and written to
  # * +array_to_save+ - Array to be saved to the txt file
  # ==== Example
  #  base = WebWordSorter.new
  #  base.save_array_to_file("words", some_array)
  def save_array_to_file(file_name, array_to_save)
    File.open("#{file_name}.txt", "w") do |file|
      array_to_save.each { |word| file.puts(word) }
    end
  end
end


class Array
  def stoogesort
    self.dup.stoogesort!
  end

  def stoogesort!(i = 0, j = self.length - 1)
    if self[j].length < self[i].length
      self[i], self[j] = self[j], self[i]
    end
    if j - i > 1
      t = (j - i + 1) / 3
      stoogesort!(i, j - t)
      stoogesort!(i + t, j)
      stoogesort!(i, j - t)
    end
    self
  end
end
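
Stooge sort runs in roughly O(n^2.7) time, so it is a novelty rather than a practical sort. A quick illustration (an editor's addition, traced by hand against the algorithm) of the length-based ordering the monkey patch above produces:

    words = ["banana", "fig", "pear"]
    words.stoogesort  # => ["fig", "pear", "banana"]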

data/lib/examples/example.rb
ADDED
@@ -0,0 +1,49 @@
require 'WebWordSorter'
require 'colorize'

# Author:: John Z. Abrams (mailto:jzabrams@unm.edu)
# Copyright:: Copyright (c) 2014 John Z. Abrams
# License:: Distributed under MIT License


#
# == Description
# This is an example of the use of the WebWordSorter gem.
# This program will take all of the unique words from
# a user-given website, sort them, and save them to a
# text file with a user-given name/location.
#

puts "Enter the URL for the website you want to crawl (enter the full URL, including http:// or https://, and any required domain prefix):".blue
url = gets.chomp

puts "Enter a name for the output file. It will be saved in the directory where you run this program.".blue
file_name = gets.chomp

puts "Crawling #{url} for all links. This may take a while...".red
wws = WebWordSorter.new
urls = wws.crawler url
puts "Crawling complete.".green

puts "Converting pages to words.".red
pages_string = wws.pages_to_string urls
parsed_string = wws.parse_string pages_string
words_array = wws.split_uniq parsed_string
puts "Page conversion complete.".green

puts "Spell checking words.".red
array = wws.spell_check(words_array)
puts "Spell checking complete.".green

puts "Sorting words. Please wait...".red
sorted_array = wws.stooge_sort array
puts "Sorting complete.".green

puts "Saving file #{file_name}.txt".red
wws.save_array_to_file(file_name, sorted_array)
puts "DONE!".green
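
For a non-interactive variant of the pipeline above, the same calls can be chained directly (a sketch with a hypothetical URL and output name, assuming the gem and its dependencies are installed):

    require 'WebWordSorter'

    wws = WebWordSorter.new
    urls = wws.crawler("http://www.example.com")
    words = wws.split_uniq(wws.parse_string(wws.pages_to_string(urls)))
    wws.save_array_to_file("example_words", wws.stooge_sort(wws.spell_check(words)))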