WebWordSorter 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/WebWordSorter.rb +249 -0
- data/lib/examples/example.rb +49 -0
- data/lib/resources/words.txt +235887 -0
- data/lib/test/WWS_test_cases.rb +145 -0
- metadata +137 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: bbb277b659ed1e9907e2223f3384bed97240aa4e
  data.tar.gz: 253a40ff16ee7810edb4f61f2765fdcc7577685b
SHA512:
  metadata.gz: e2dd2461f42c708a90540bc0e02b615c59f27901717e1b8954e0e6807aaef0122a89f5a29eb7c13df25b871192e4d84b3e7ce1fee1ba896354f8ab187e83a1e5
  data.tar.gz: b258421a958485c0079628a5fc6b8f0cee29ba25bab65a78d2022a4ea4f46b28e864f7cf711113055b6646ce6fba40c5fb7f4870c09053bb83ab39ecac9f585d
data/lib/WebWordSorter.rb
ADDED
@@ -0,0 +1,249 @@
require 'nokogiri'
require 'anemone'
require 'open-uri'
require 'faraday'
require 'pdf/reader'
require 'rubygems'
require 'ruby-progressbar'

# Author:: John Z. Abrams (mailto:jzabrams@unm.edu)
# Copyright:: Copyright (c) 2014 John Z. Abrams
# License:: Distributed under MIT License

##
# == Description
# This class contains all the necessary methods to do the following:
# - Crawl most websites and return an array of their URLs.
# - Convert all HTML and most web-linked PDF documents to one large string, given an array of URLs.
# - Parse out all non-words and non-human-sensible markup.
# - Stooge sort an array of words via iteration, <b>NOT recursion. NOTE: The current version
#   IS using recursive stooge sort!</b>
# - Write the array of words to a file.
#
# When used properly, this class will take all unique words on a website, parse them, sort them, and write them to a .txt file.
#
class WebWordSorter
  ##
  # === Description
  # Use the anemone gem to crawl each page of a given site and store each URL in
  # an array, which is returned.
  #
  # ==== Attribute
  # * +url_base+ - Base URL for the crawler to start from
  # ==== Returns
  # * +page_urls+ - An array that contains all of the linked URLs for a given domain
  # ==== Example
  # base = WebWordSorter.new
  # base.crawler("http://www.example.com")
  def crawler(url_base)
    pg_crawl = ProgressBar.create(:title => "Crawling: #{url_base}", :starting_at => 20, :total => nil, :format => '%a %t')

    page_urls = Array.new

    Anemone.crawl("#{url_base}") do |anemone|
      anemone.on_every_page do |page|
        page_urls << page.url

        pg_crawl.title = page.url
        pg_crawl.increment
      end
    end
    pg_crawl.total = 100
    pg_crawl.finish
    return page_urls
  end
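
  # A side note on crawl scope: Anemone also accepts options on the crawl
  # call, and capping depth keeps large sites manageable. A minimal sketch,
  # assuming Anemone's :depth_limit option (not used by this gem):
  #
  #   Anemone.crawl("http://www.example.com", :depth_limit => 2) do |anemone|
  #     anemone.on_every_page { |page| puts page.url }
  #   end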

  ##
  # === Description
  # Takes an array of URLs and converts each one to a string. All of these strings are
  # added to one larger string that is returned. The Faraday gem is used to check that
  # each link is still alive before trying to convert it. PDF files are converted to
  # strings using the pdf-reader gem. Nokogiri is used here as well to remove
  # most HTML markup before adding the URL's string to the main, larger string.
  #
  # ==== Attribute
  # * +url_array+ - Array of URLs to convert to a string
  # ==== Returns
  # * +pages_string+ - A partially sterilized string containing all human-relevant contents of each given URL
  #
  # ==== Example
  # base = WebWordSorter.new
  # urls = ["http://www.test1.com","http://www.test2.com","http://www.test3.com"]
  # base.pages_to_string(urls)
  def pages_to_string(url_array)
    pages_string = String.new

    url_array.each do |address|
      if Faraday.head(address).status == 200
        if ("#{address}"[-4, 4] == ".pdf")
          pdf_file = open(address)
          pdf_reader = PDF::Reader.new(pdf_file)

          pdf_reader.pages.each do |page|
            pages_string = pages_string + page.text
          end
        else
          data = Nokogiri::HTML(open(address))

          # take advantage of the Nokogiri script removal
          # BEFORE converting to string
          data.css('script').remove

          pages_string = pages_string + data.text
        end
      end
    end

    return pages_string
  end
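
  # Condensed restatement of the per-URL branching above, as a sketch under
  # the same assumptions (open-uri loaded, +address+ reachable):
  #
  #   body = if address.to_s.end_with?(".pdf")
  #            PDF::Reader.new(open(address)).pages.map(&:text).join
  #          else
  #            doc = Nokogiri::HTML(open(address))
  #            doc.css('script').remove
  #            doc.text
  #          end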

  ##
  # === Description
  # Take in a string, remove anything that isn't a letter or a single space,
  # and return the resulting string.
  #
  # ==== Attribute
  # * +pre_parse_string+ - String to be parsed
  # ==== Returns
  # * +parsed_string+ - String containing only letters and spaces
  # ==== Example
  # base = WebWordSorter.new
  # base.parse_string("hello this is a test 1234 !@#$")
  #
  # will return: "hello this is a test"
  def parse_string(pre_parse_string)
    parsed_string = pre_parse_string.gsub(/-+/, '')
    parsed_string = parsed_string.gsub(/[^a-zA-Z ]+/, ' ')
    parsed_string = parsed_string.squeeze(' ')

    return parsed_string
  end
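
  # An illustrative trace of the three passes above on a made-up input:
  #
  #   s = "well-known words 123!"
  #   s = s.gsub(/-+/, '')            #=> "wellknown words 123!"
  #   s = s.gsub(/[^a-zA-Z ]+/, ' ')  #=> "wellknown words  "
  #   s.squeeze(' ')                  #=> "wellknown words "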

  ##
  # === Description
  # Take in a string, split it at each space into an array, and
  # remove any non-unique entries.
  #
  # ==== Attribute
  # * +pre_array_string+ - String to be split and have all entries made unique
  # ==== Returns
  # * +words_array+ - array of unique, letter-only entries
  # ==== Example
  # base = WebWordSorter.new
  # base.split_uniq some_string
  def split_uniq(pre_array_string)
    words_array = pre_array_string.split

    words_array = words_array.uniq

    words_array.map! { |x| x.strip }

    return words_array
  end

  ##
  # === Description
  # Take in an array and remove any entry that doesn't match an entry in a
  # predetermined dictionary txt file. This ensures that only valid words
  # make it out.
  #
  # ==== Attribute
  # * +words+ - array of strings
  # ==== Returns
  # * +words+ - array of valid words, as determined by comparison to a dictionary txt file
  # ==== Example
  # base = WebWordSorter.new
  # base.spell_check some_array
  def spell_check(words)
    dict_path = File.join(File.dirname(__FILE__), 'resources/words.txt')

    dictionary = IO.readlines(dict_path).map { |line| line.strip }

    words.reject { |word| !dictionary.include? word }
  end
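
  # The Array#include? check above rescans the whole dictionary once per
  # word. A minimal sketch of the same filtering with the standard library's
  # Set, assuming the same one-word-per-line words.txt:
  #
  #   require 'set'
  #   dictionary = Set.new(IO.readlines(dict_path).map { |line| line.strip })
  #   words.select { |word| dictionary.include?(word) }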

  ##
  # === Description
  # Take in an array and sort its entries short to long using iterative stooge sort.
  # === NOTE: Stooge sort is currently implemented using recursion, not iteration.
  # I am still working on making it iterative.
  #
  # ==== Attribute
  # * +input_array+ - array to be sorted
  # ==== Returns
  # * +sorted_array+ - array of strings sorted from shortest to longest
  # ==== Example
  # base = WebWordSorter.new
  # base.stooge_sort some_array
  def stooge_sort(input_array)
    sorted_array = input_array.stoogesort

    return sorted_array
  end

  ##
  # === Description
  # Take in an array and save its contents to a file. The file is created by the method and
  # cannot exist prior to execution.
  #
  # ==== Attributes
  # * +file_name+ - name of the file to be created and written to
  # * +array_to_save+ - array to be saved to the txt file
  # ==== Example
  # base = WebWordSorter.new
  # base.save_array_to_file("words", some_array)
  def save_array_to_file(file_name, array_to_save)
    File.open("#{file_name}.txt", "w") do |file|
      array_to_save.each { |word| file.puts(word) }
    end
  end
end


class Array
  # Sort a copy of the array, shortest entry to longest, leaving the
  # receiver untouched.
  def stoogesort
    self.dup.stoogesort!
  end

  # In-place recursive stooge sort, comparing entries by length.
  def stoogesort!(i = 0, j = self.length - 1)
    if self[j].length < self[i].length
      self[i], self[j] = self[j], self[i]
    end
    if j - i > 1
      t = (j - i + 1) / 3
      stoogesort!(i, j - t)
      stoogesort!(i + t, j)
      stoogesort!(i, j - t)
    end
    self
  end
end
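
# A quick illustrative check of the length-based ordering implemented above:
#
#   %w[pear fig banana].stoogesort  #=> ["fig", "pear", "banana"]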
data/lib/examples/example.rb
ADDED
@@ -0,0 +1,49 @@
require 'WebWordSorter'
require 'colorize'

# Author:: John Z. Abrams (mailto:jzabrams@unm.edu)
# Copyright:: Copyright (c) 2014 John Z. Abrams
# License:: Distributed under MIT License


#
# == Description
# This is an example of the use of the WebWordSorter gem.
# This program will take all of the unique words from
# a user-given website, sort them, and save them to a
# text file with a user-given name/location.
#

puts "Enter the URL for the website you want to crawl (enter the full URL including http:// or https://. Ensure the domain prefix is also included if required.):".blue
url = gets.chomp

puts "Enter a name for the output file. It will be saved in the directory where you run this program.".blue
file_name = gets.chomp

puts "Crawling #{url} for all links. This may take a while...".red
wws = WebWordSorter.new
urls = wws.crawler url
puts "Crawling complete.".green

puts "Converting pages to words.".red
pages_string = wws.pages_to_string urls
parsed_string = wws.parse_string pages_string
words_array = wws.split_uniq parsed_string
puts "Page conversion complete.".green

puts "Spell checking words.".red
array = wws.spell_check(words_array)
puts "Spell checking complete.".green

puts "Sorting words. Please wait...".red
sorted_array = wws.stooge_sort array
puts "Sorting complete.".green

puts "Saving file #{file_name}.txt".red
wws.save_array_to_file(file_name, sorted_array)
puts "DONE!".green