WebWordSorter 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: bbb277b659ed1e9907e2223f3384bed97240aa4e
+   data.tar.gz: 253a40ff16ee7810edb4f61f2765fdcc7577685b
+ SHA512:
+   metadata.gz: e2dd2461f42c708a90540bc0e02b615c59f27901717e1b8954e0e6807aaef0122a89f5a29eb7c13df25b871192e4d84b3e7ce1fee1ba896354f8ab187e83a1e5
+   data.tar.gz: b258421a958485c0079628a5fc6b8f0cee29ba25bab65a78d2022a4ea4f46b28e864f7cf711113055b6646ce6fba40c5fb7f4870c09053bb83ab39ecac9f585d
@@ -0,0 +1,249 @@
+ require 'nokogiri'
+ require 'anemone'
+ require 'open-uri'
+ require 'faraday'
+ require 'pdf/reader'
+ require 'rubygems'
+ require 'ruby-progressbar'
+
+ # Author:: John Z. Abrams (mailto:jzabrams@unm.edu)
+ # Copyright:: Copyright (c) 2014 John Z. Abrams
+ # License:: Distributed under MIT License
+
+ ##
+ # == Description
+ # This class contains all the necessary methods to do the following:
+ # - Crawl most websites and return an array of their URLs.
+ # - Convert all HTML and most web-linked PDF documents to one large string, given an array of URLs.
+ # - Parse out all non-words and non-human-sensible markup.
+ # - Stooge sort an array of words via iteration, <b>NOT recursion. NOTE: The current version
+ #   IS using recursive stooge sort!</b>
+ # - Write the array of words to a file.
+ #
+ # When used properly, this class will take all unique words on a website, parse them, sort them, and write them to a .txt file.
+ #
+ class WebWordSorter
+   ##
+   # === Description
+   # Use the anemone gem to crawl each page of a given site and store each URL in
+   # an array, which is returned.
+   #
+   # ==== Attribute
+   # * +url_base+ - Base URL for the crawler to start from
+   # ==== Returns
+   # * +page_urls+ - An array that contains all of the linked URLs for a given domain
+   # ==== Example
+   #   base = WebWordSorter.new
+   #   base.crawler("http://www.example.com")
+   def crawler(url_base)
+     # The page count is unknown up front, so the bar runs in indeterminate mode.
+     pg_crawl = ProgressBar.create(:title => "Crawling: #{url_base}", :starting_at => 20, :total => nil, :format => '%a %t')
+
+     page_urls = Array.new
+
+     Anemone.crawl(url_base) do |anemone|
+       anemone.on_every_page do |page|
+         page_urls << page.url
+
+         # Show the URL currently being visited in the bar title.
+         pg_crawl.title = page.url
+         pg_crawl.increment
+       end
+     end
+     # Close out the bar; the total must never be less than the progress.
+     pg_crawl.total = pg_crawl.progress
+     pg_crawl.finish
+     return page_urls
+   end
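
Anemone also accepts options that bound a crawl, which matters on large sites where an unbounded crawl can run for a very long time. A minimal sketch, not part of this gem, using Anemone's documented :depth_limit and :obey_robots_txt options:

    require 'anemone'

    # Sketch only: stop after two link hops and honor robots.txt.
    Anemone.crawl("http://www.example.com", :depth_limit => 2, :obey_robots_txt => true) do |anemone|
      anemone.on_every_page { |page| puts page.url }
    end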
+
+   ##
+   # === Description
+   # Takes an array of URLs and converts each one to a string. All of these strings are
+   # added to one larger string that is returned. The Faraday gem is used to check that
+   # each link is still alive before trying to convert it. PDF files are converted to
+   # strings using the pdf-reader gem. Nokogiri is used here as well to remove
+   # most HTML markup before adding each page's text to the main larger string.
+   #
+   # ==== Attribute
+   # * +url_array+ - Array of URLs to convert to a string
+   # ==== Returns
+   # * +pages_string+ - A partially sanitized string containing all human-relevant contents of each given URL
+   #
+   # ==== Example
+   #   base = WebWordSorter.new
+   #   urls = ["http://www.test1.com", "http://www.test2.com", "http://www.test3.com"]
+   #   base.pages_to_string(urls)
+   def pages_to_string(url_array)
+     pages_string = String.new
+
+     url_array.each do |address|
+       # Only fetch links that are still alive.
+       if Faraday.head(address).status == 200
+         if address.to_s.end_with?(".pdf")
+           pdf_file = open(address)
+           pdf_reader = PDF::Reader.new(pdf_file)
+
+           pdf_reader.pages.each do |page|
+             pages_string = pages_string + page.text
+           end
+         else
+           data = Nokogiri::HTML(open(address))
+
+           # Take advantage of the Nokogiri script removal
+           # BEFORE converting to text.
+           data.css('script').remove
+
+           pages_string = pages_string + data.text
+         end
+       end
+     end
+
+     return pages_string
+   end
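
One portability caveat: the bare open(address) calls rely on open-uri patching Kernel#open, a behavior Ruby 3.0 removed. A minimal sketch of a version-safe fetch, assuming the same open-uri and Nokogiri gems (URI.open has been available since Ruby 2.5):

    require 'open-uri'
    require 'nokogiri'

    # Sketch only: URI.open works on modern Rubies where bare open(url) does not.
    data = Nokogiri::HTML(URI.open("http://www.example.com"))
    data.css('script').remove
    puts data.text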
+
+   ##
+   # === Description
+   # Take in a string, remove anything that isn't a letter or a single space,
+   # and return the resulting string.
+   #
+   # ==== Attribute
+   # * +pre_parse_string+ - String to be parsed
+   # ==== Returns
+   # * +parsed_string+ - String containing only letters and spaces
+   # ==== Example
+   #   base = WebWordSorter.new
+   #   base.parse_string("hello this is a test 1234 !@#$")
+   #
+   # will return: "hello this is a test"
+   def parse_string(pre_parse_string)
+     # Drop hyphens outright, replace every other non-letter run with a space,
+     # then collapse repeated spaces.
+     parsed_string = pre_parse_string.gsub(/-+/, '')
+     parsed_string = parsed_string.gsub(/[^a-zA-Z ]+/, ' ')
+     parsed_string = parsed_string.squeeze(' ')
+
+     return parsed_string
+   end
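
Because hyphens are deleted in a separate first pass, hyphenated words are merged rather than split, while every other non-letter run becomes a word boundary. A quick check:

    base = WebWordSorter.new
    base.parse_string("a well-known test, no. 42")
    # => "a wellknown test no " (the hyphen goes first, so the word stays joined)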
+
+   ##
+   # === Description
+   # Take in a string, split it at each space into an array, and
+   # remove any non-unique entries.
+   #
+   # ==== Attribute
+   # * +pre_array_string+ - String to be split and have all entries made unique
+   # ==== Returns
+   # * +words_array+ - Array of unique, letter-only entries
+   # ==== Example
+   #   base = WebWordSorter.new
+   #   base.split_uniq some_string
+   def split_uniq(pre_array_string)
+     words_array = pre_array_string.split
+
+     words_array = words_array.uniq
+
+     # Strip any stray whitespace in place.
+     words_array.map! { |x| x.strip }
+
+     return words_array
+   end
+
+   ##
+   # === Description
+   # Take in an array and remove any entry that doesn't match an entry in a
+   # predetermined dictionary txt file. This ensures that only valid words
+   # make it out.
+   #
+   # ==== Attribute
+   # * +words+ - Array of strings
+   # ==== Returns
+   # * +words+ - Array of valid words, as determined by comparison to a dictionary txt file
+   # ==== Example
+   #   base = WebWordSorter.new
+   #   base.spell_check some_array
+   def spell_check(words)
+     dict_path = File.join(File.dirname(__FILE__), '/resources/words.txt')
+
+     dictionary = IO.readlines(dict_path).map { |line| line.strip }
+
+     # Keep only words that appear in the dictionary.
+     words.select { |word| dictionary.include? word }
+   end
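
Membership testing against the dictionary array is a linear scan per word, so the check costs roughly O(words × dictionary). A minimal sketch of a faster variant using the standard library's Set; this is not what the gem ships, but it keeps the same result with near-constant-time lookups:

    require 'set'

    # Sketch only: build the dictionary once as a Set for O(1) average lookups.
    dict_path = File.join(File.dirname(__FILE__), 'resources/words.txt')
    dictionary = Set.new(IO.readlines(dict_path).map { |line| line.strip })
    valid_words = words.select { |word| dictionary.include?(word) }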
+
+   ##
+   # === Description
+   # Take in an array and sort its entries short to long using iterative stooge sort.
+   # ===NOTE: Stooge sort is currently implemented using recursion, not iteration.
+   # I am still working on making it iterative.
+   #
+   # ==== Attribute
+   # * +input_array+ - Array to be sorted
+   # ==== Returns
+   # * +sorted_array+ - Array of strings sorted from shortest to longest
+   # ==== Example
+   #   base = WebWordSorter.new
+   #   base.stooge_sort some_array
+   def stooge_sort(input_array)
+     sorted_array = input_array.stoogesort
+
+     return sorted_array
+   end
+
+   ##
+   # === Description
+   # Take in an array and save its contents to a file. The file is created by the
+   # method; any existing file with the same name is overwritten.
+   #
+   # ==== Attributes
+   # * +file_name+ - Name of the file to be created and written to
+   # * +array_to_save+ - Array to be saved to the txt file
+   # ==== Example
+   #   base = WebWordSorter.new
+   #   base.save_array_to_file("some_file", some_array)
+   def save_array_to_file(file_name, array_to_save)
+     File.open("#{file_name}.txt", "w") do |file|
+       # One word per line.
+       array_to_save.each { |word| file.puts(word) }
+     end
+   end
+ end
+
+
+ class Array
+   def stoogesort
+     self.dup.stoogesort!
+   end
+
+   # Recursive stooge sort, ordering strings from shortest to longest.
+   def stoogesort!(i = 0, j = self.length - 1)
+     # Guard against empty and single-element arrays.
+     return self if length < 2
+     if self[j].length < self[i].length
+       self[i], self[j] = self[j], self[i]
+     end
+     if j - i > 1
+       t = (j - i + 1) / 3
+       stoogesort!(i, j - t)
+       stoogesort!(i + t, j)
+       stoogesort!(i, j - t)
+     end
+     self
+   end
+ end
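
The NOTE in stooge_sort says an iterative version is still in progress. One way to get there, sketched below as a hypothetical stoogesort_iterative! that is not part of this release, is to replace the call stack with an explicit stack of index pairs. Since each range's swap happens before its subranges are expanded, popping in LIFO order performs the same comparisons in the same order as the recursion:

    class Array
      # Sketch only: iterative stooge sort over an explicit stack of [i, j] ranges.
      def stoogesort_iterative!
        return self if length < 2
        stack = [[0, length - 1]]
        until stack.empty?
          i, j = stack.pop
          if self[j].length < self[i].length
            self[i], self[j] = self[j], self[i]
          end
          if j - i > 1
            t = (j - i + 1) / 3
            # Push in reverse so (i, j-t) is processed first, as in the recursion.
            stack.push([i, j - t])
            stack.push([i + t, j])
            stack.push([i, j - t])
          end
        end
        self
      end
    end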
@@ -0,0 +1,49 @@
+ require 'WebWordSorter'
+ require 'colorize'
+
+ # Author:: John Z. Abrams (mailto:jzabrams@unm.edu)
+ # Copyright:: Copyright (c) 2014 John Z. Abrams
+ # License:: Distributed under MIT License
+
+ #
+ # == Description
+ # This is an example of the use of the WebWordSorter gem.
+ # This program will take all of the unique words from
+ # a user-given website, sort them, and save them to a
+ # text file with a user-given name/location.
+ #
+
+ puts "Enter the URL for the website you want to crawl (enter the full URL, including http:// or https://. Ensure the domain prefix is also included if required.):".blue
+ url = gets.chomp
+
+ puts "Enter a name for the output file. It will be saved in the directory where you run this program.".blue
+ file_name = gets.chomp
+
+ puts "Crawling #{url} for all links. This may take a while...".red
+ wws = WebWordSorter.new
+ urls = wws.crawler url
+ puts "Crawling complete.".green
+
+ puts "Converting pages to words.".red
+ pages_string = wws.pages_to_string urls
+ parsed_string = wws.parse_string pages_string
+ words_array = wws.split_uniq parsed_string
+ puts "Page conversion complete.".green
+
+ puts "Spell checking words.".red
+ array = wws.spell_check(words_array)
+ puts "Spell checking complete.".green
+
+ puts "Sorting words. Please wait...".red
+ sorted_array = wws.stooge_sort array
+ puts "Sorting complete.".green
+
+ puts "Saving file #{file_name}".red
+ wws.save_array_to_file(file_name, sorted_array)
+ puts "DONE!".green