WebWordSorter 0.1.2

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: bbb277b659ed1e9907e2223f3384bed97240aa4e
+   data.tar.gz: 253a40ff16ee7810edb4f61f2765fdcc7577685b
+ SHA512:
+   metadata.gz: e2dd2461f42c708a90540bc0e02b615c59f27901717e1b8954e0e6807aaef0122a89f5a29eb7c13df25b871192e4d84b3e7ce1fee1ba896354f8ab187e83a1e5
+   data.tar.gz: b258421a958485c0079628a5fc6b8f0cee29ba25bab65a78d2022a4ea4f46b28e864f7cf711113055b6646ce6fba40c5fb7f4870c09053bb83ab39ecac9f585d
@@ -0,0 +1,249 @@
+ require 'nokogiri'
+ require 'anemone'
+ require 'open-uri'
+ require 'faraday'
+ require 'pdf/reader'
+ require 'rubygems'
+ require 'ruby-progressbar'
+
+ # Author:: John Z. Abrams (mailto:jzabrams@unm.edu)
+ # Copyright:: Copyright (c) 2014 John Z. Abrams
+ # License:: Distributed under MIT License
+
+ ##
+ # == Description
+ # This class contains all the necessary methods to do the following:
+ # - Crawl most websites and return an array of their URLs.
+ # - Convert all HTML and most web-linked PDF documents to one large string, given an array of URLs.
+ # - Parse out all non-words and non-human-sensible markup.
+ # - Stooge sort an array of words via iteration, <b>NOT recursion. NOTE: The current version
+ #   IS using recursive stooge sort!</b>
+ # - Write the array of words to a file.
+ #
+ # When used properly, this class will take all unique words on a website,
+ # parse them, sort them, and write them to a .txt file.
+ #
+ class WebWordSorter
+   ##
+   # === Description
+   # Use the anemone gem to crawl each page of a given site and store each URL in
+   # an array, which is returned.
+   #
+   # ==== Attribute
+   # * +url_base+ - Base URL for the crawler to start from
+   # ==== Returns
+   # * +page_urls+ - An array that contains all of the linked URLs for a given domain
+   # ==== Example
+   #  base = WebWordSorter.new
+   #  base.crawler("http://www.example.com")
+   def crawler(url_base)
+     pg_crawl = ProgressBar.create(:title => "Crawling: #{url_base}", :starting_at => 20, :total => nil, :format => '%a %t')
+
+     page_urls = Array.new
+
+     Anemone.crawl(url_base) do |anemone|
+       anemone.on_every_page do |page|
+         page_urls << page.url
+
+         pg_crawl.title = page.url
+         pg_crawl.increment
+       end
+     end
+     # The total is unknown while crawling; settle it at the current
+     # progress so finishing does not fail on sites with many pages.
+     pg_crawl.total = pg_crawl.progress
+     pg_crawl.finish
+     return page_urls
+   end
+
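By default Anemone follows every link on the domain, so large sites can take a long time to crawl. Anemone's options hash can bound the crawl; a hedged sketch using its :depth_limit option (verify against the Anemone documentation for your version):

  Anemone.crawl(url_base, :depth_limit => 2) do |anemone|
    # Only pages reachable within two link hops are visited.
    anemone.on_every_page { |page| page_urls << page.url }
  end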
+   ##
+   # === Description
+   # Takes an array of URLs and converts each one to a string. All of these strings are
+   # added to one larger string that is returned. The Faraday gem is used to check that
+   # each link is still alive before trying to convert it. PDF files are converted to
+   # strings using the pdf-reader gem. Nokogiri is used here as well to remove
+   # most HTML markup before adding each page's string to the main larger string.
+   #
+   # ==== Attribute
+   # * +url_array+ - Array of URLs to convert to a string
+   # ==== Returns
+   # * +pages_string+ - A partially sterilized string containing all human-relevant contents of each given URL
+   #
+   # ==== Example
+   #  base = WebWordSorter.new
+   #  urls = ["http://www.test1.com","http://www.test2.com","http://www.test3.com"]
+   #  base.pages_to_string(urls)
+   def pages_to_string(url_array)
+     pages_string = String.new
+
+     url_array.each do |address|
+       if Faraday.head(address).status == 200
+         if address.to_s.end_with?(".pdf")
+           pdf_file = open(address)
+           pdf_reader = PDF::Reader.new(pdf_file)
+
+           pdf_reader.pages.each do |page|
+             pages_string = pages_string + page.text
+           end
+         else
+           data = Nokogiri::HTML(open(address))
+
+           # Take advantage of the Nokogiri script removal
+           # BEFORE converting to a string.
+           data.css('script').remove
+
+           # data is a Nokogiri document; .text extracts its plain text.
+           pages_string = pages_string + data.text
+         end
+       end
+     end
+
+     return pages_string
+   end
+
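One fragility worth noting: Faraday.head raises an exception rather than returning a status when a host is unreachable, so a single dead link can abort the whole loop. A minimal hardening sketch, built around a hypothetical helper link_alive? that is not part of this release:

  # Treat connection errors (DNS failures, timeouts) as dead links
  # instead of letting them crash the loop.
  def link_alive?(address)
    Faraday.head(address).status == 200
  rescue StandardError
    false
  end

The condition in pages_to_string would then read if link_alive?(address).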
+   ##
+   # === Description
+   # Take in a string, remove anything that isn't a letter or a single space,
+   # and return the resulting string.
+   #
+   # ==== Attribute
+   # * +pre_parse_string+ - String to be parsed
+   # ==== Returns
+   # * +parsed_string+ - String containing only letters and spaces
+   # ==== Example
+   #  base = WebWordSorter.new
+   #  base.parse_string("hello this is a test 1234 !@#$")
+   #
+   # will return: "hello this is a test"
+   def parse_string(pre_parse_string)
+     parsed_string = pre_parse_string.gsub(/-+/, '')
+     parsed_string = parsed_string.gsub(/[^a-zA-Z ]+/, ' ')
+     parsed_string = parsed_string.squeeze(' ').strip
+
+     return parsed_string
+   end
+
+   ##
+   # === Description
+   # Take in a string, split it at each space into an array, and
+   # remove any non-unique entries.
+   #
+   # ==== Attribute
+   # * +pre_array_string+ - String to be split and have all entries made unique
+   # ==== Returns
+   # * +words_array+ - Array of unique, letter-only entries
+   # ==== Example
+   #  base = WebWordSorter.new
+   #  base.split_uniq some_string
+   def split_uniq(pre_array_string)
+     words_array = pre_array_string.split
+
+     words_array = words_array.uniq
+
+     # map! mutates in place; plain map would discard the stripped copies.
+     words_array.map! { |x| x.strip }
+
+     return words_array
+   end
+
+   ##
+   # === Description
+   # Take in an array and remove any entry that doesn't match an entry in a
+   # predetermined dictionary txt file. This ensures that only valid words
+   # make it out.
+   #
+   # ==== Attribute
+   # * +words+ - Array of strings
+   # ==== Returns
+   # * +words+ - Array of valid words, as determined by comparison to a dictionary txt file
+   # ==== Example
+   #  base = WebWordSorter.new
+   #  base.spell_check some_array
+   def spell_check(words)
+     dict_path = File.join(File.dirname(__FILE__), '/resources/words.txt')
+
+     dictionary = IO.readlines(dict_path).map { |line| line.strip }
+
+     words.select { |word| dictionary.include?(word) }
+   end
+
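Because Array#include? scans the whole dictionary for every word, this check costs O(words × dictionary size). Ruby's standard-library Set turns each membership test into a hash lookup; a sketch of the same method under that change, assuming the same resources/words.txt layout:

  require 'set'

  def spell_check(words)
    dict_path = File.join(File.dirname(__FILE__), '/resources/words.txt')
    # Build the Set once; each include? is then effectively constant time.
    dictionary = Set.new(IO.readlines(dict_path).map { |line| line.strip })
    words.select { |word| dictionary.include?(word) }
  end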
+   ##
+   # === Description
+   # Take in an array and sort its entries short to long using iterative stooge sort.
+   # ===NOTE: Stooge sort is currently implemented using recursion, not iteration.
+   # I am still working on making it iterative.
+   #
+   # ==== Attribute
+   # * +input_array+ - Array to be sorted
+   # ==== Returns
+   # * +sorted_array+ - Array of strings sorted from shortest to longest
+   # ==== Example
+   #  base = WebWordSorter.new
+   #  base.stooge_sort some_array
+   def stooge_sort(input_array)
+     sorted_array = input_array.stoogesort
+
+     return sorted_array
+   end
+
+   ##
+   # === Description
+   # Take in an array and save its contents to a file. The file is created by the
+   # method; an existing file with the same name will be overwritten.
+   #
+   # ==== Attributes
+   # * +file_name+ - Name of the file to be created and written to
+   # * +array_to_save+ - Array to be saved to the txt file
+   # ==== Example
+   #  base = WebWordSorter.new
+   #  base.save_array_to_file("words", some_array)
+   def save_array_to_file(file_name, array_to_save)
+     File.open("#{file_name}.txt", "w") do |file|
+       array_to_save.each { |word| file.puts(word) }
+     end
+   end
+ end
+
+ # Extend Array with a stooge sort that orders strings from shortest to longest.
+ class Array
+   def stoogesort
+     self.dup.stoogesort!
+   end
+
+   def stoogesort!(i = 0, j = self.length - 1)
+     # Swap the endpoints if they are out of order (compared by length).
+     if self[j].length < self[i].length
+       self[i], self[j] = self[j], self[i]
+     end
+     if j - i > 1
+       t = (j - i + 1) / 3
+       # Recurse on the first two-thirds, the last two-thirds,
+       # and the first two-thirds again.
+       stoogesort!(i, j - t)
+       stoogesort!(i + t, j)
+       stoogesort!(i, j - t)
+     end
+     self
+   end
+ end
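The NOTE in stooge_sort's documentation promises an iterative implementation. One way to get there is to simulate the recursion with an explicit stack of (i, j) ranges; the sketch below performs the same length comparisons in the same order as the recursive version above. It is not part of this release, and the method name stoogesort_iterative! is hypothetical:

  class Array
    def stoogesort_iterative!
      return self if length < 2
      stack = [[0, length - 1]]
      until stack.empty?
        i, j = stack.pop
        # Same endpoint swap as the recursive version, done when a range is popped.
        self[i], self[j] = self[j], self[i] if self[j].length < self[i].length
        if j - i > 1
          t = (j - i + 1) / 3
          # Push the three subranges in reverse so they are popped
          # in the same order the recursive calls would run.
          stack.push([i, j - t])
          stack.push([i + t, j])
          stack.push([i, j - t])
        end
      end
      self
    end
  end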
@@ -0,0 +1,49 @@
+ require 'WebWordSorter'
+ require 'colorize'
+
+ # Author:: John Z. Abrams (mailto:jzabrams@unm.edu)
+ # Copyright:: Copyright (c) 2014 John Z. Abrams
+ # License:: Distributed under MIT License
+
+ #
+ # == Description
+ # This is an example of the use of the WebWordSorter gem.
+ # This program will take all of the unique words from
+ # a user-given website, sort them, and save them to a
+ # text file with a user-given name/location.
+ #
+
+ puts "Enter the URL for the website you want to crawl (enter the full URL, including http:// or https://, and any required domain prefix):".blue
+ url = gets.chomp
+
+ puts "Enter a name for the output file. It will be saved in the directory where you run this program.".blue
+ file_name = gets.chomp
+
+ puts "Crawling #{url} for all links. This may take a while...".red
+ wws = WebWordSorter.new
+ urls = wws.crawler url
+ puts "Crawling complete.".green
+
+ puts "Converting pages to words.".red
+ pages_string = wws.pages_to_string urls
+ parsed_string = wws.parse_string pages_string
+ words_array = wws.split_uniq parsed_string
+ puts "Page conversion complete.".green
+
+ puts "Spell checking words.".red
+ words_array = wws.spell_check(words_array)
+ puts "Spell checking complete.".green
+
+ puts "Sorting words. Please wait...".red
+ sorted_array = wws.stooge_sort words_array
+ puts "Sorting complete.".green
+
+ puts "Saving file #{file_name}.txt".red
+ wws.save_array_to_file(file_name, sorted_array)
+ puts "DONE!".green