gimme_poc 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6984d721d974f11b5dab3136323f68d924867ad6
4
- data.tar.gz: 78503da039ce280bb1bb69898065b5d60a2d5463
3
+ metadata.gz: 6ca0656c586244edfaaac44d81b092956e1ed801
4
+ data.tar.gz: b2bbc85c51a79ba5a10a1a5e77d8084ac28adeee
5
5
  SHA512:
6
- metadata.gz: 52770e7b95a98abc5638e7b017674f5c58b61b22bbed847bb009cc5092aea3f533922f0da9bd0a571c5e40050f26779c5734bb21c308f065742c66994f0faf16
7
- data.tar.gz: b8405f565d973365c4bd05d6cab1f52295bd4b83c0fac858654e87d738cb3cf758096b6f5e2402716a2c768e11182479b71a6cad7c18869751a0fdadd9e6a7cf
6
+ metadata.gz: f2e131c8e68fb8a55169f62b1d9a5662901f6fcb45907878c30d8d700d784b66aa8cae0305c060a611da7d60b810cc2d56bb56959db1da378a3088fa6568d707
7
+ data.tar.gz: 6f32874d9fc287ce588baa0e91fa854ed7f75f045f88d798ebf02a0e781918f50d6922ffd06578965d7c033cacdb6a927d03efc7b7967c6a225d7579a8631853
@@ -0,0 +1,55 @@
1
# Find the contact
module Gimme
  class << self
    ##
    # Scans for a contact page. If that does not work on the first try,
    # it looks for an english version of the site and tries again.
    # Processes left to right: the english lookup only runs when the
    # first attempt returned nil.
    #
    # Returns nil if no contact page can be found.
    def go_to_contact_page(url)
      contact_page(url) || english_contact_page(url)
    end

    ##
    # Looks for a contact link on the current page and requests it.
    # If no contact link is available, it blind tests '../contact' and,
    # failing that, requests the original domain of +url+.
    #
    # Returns whatever the last request returned; nil if nothing worked.
    def contact_page(url)
      contact_link = link_with_href(/contact|Contact/)

      if contact_link
        puts "#{'Success:'.green} Found contact link!\n"
        get(merged_link(contact_link))
      else
        puts "#{'Warning:'.yellow} couldn't find contact link"
        blind_test(merged_link('../contact')) || get(orig_domain(url))
      end
    end

    ##
    # Looks for an english page on the current page.
    #
    # If an english link is present, that page is requested and the result
    # of the request is returned.
    #
    # If no english link is available, it blind tests '../en' and then
    # '../english', and afterwards runs contact_page(url) again.
    # Returns nil if nothing can be found.
    def english_contact_page(url)
      puts "\nLooking for english page..."
      english_link = page.link_with(href: /english|English/)

      if english_link
        puts "#{'Success:'.green} found english link!"
        # link_with returns a link object, not a string, so its href has to
        # be merged against the current page before it can be requested.
        get(merged_link(english_link.href))
      else
        blind_test(merged_link('../en')) || blind_test(merged_link('../english'))
        puts 'ready to start again'
        contact_page(url)
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module Gimme
  # Registry of the sites that have been searched.
  class Search
    @all_sites = []

    # Class-level reader and writer for the @all_sites list.
    singleton_class.send(:attr_accessor, :all_sites)

    # One searched site. Every new instance adds itself to Search.all_sites.
    class POC
      attr_accessor :host, :info

      # url              - stored as #host
      # contact_info_hsh - stored as #info
      def initialize(url, contact_info_hsh)
        @host, @info = url, contact_info_hsh
        Search.all_sites.push(self)
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
# Find the contact
module Gimme
  class << self
    ##
    # Boolean, returns true if +hsh+ holds at least one value that is
    # neither nil nor the string 'false' (booleans are stored as strings).
    #
    # The hash is only inspected, never modified, so asking the question
    # has no side effects on the caller's data.
    def something_to_save?(hsh)
      hsh.any? { |_key, value| !value.nil? && value != 'false' }
    end

    # Boolean, returns true if a link whose href matches 'mailto' is found.
    def email_available?
      !link_with_href('mailto').nil?
    end

    # Boolean, returns true if PHONE_REGEX matches somewhere in the page body.
    def phone_available?
      !(page.body =~ PHONE_REGEX).nil?
    end

    ##
    # TODO: build better conditional to prevent false positives.
    # There could be other forms like newsletter signup, etc.
    #
    # If there is a form with more than one field, this returns true.
    # Forms with one field are typically search boxes.
    #
    # Boolean, returns true if such a form is present on the page.
    def contactform_available?
      page.forms.any? { |form| form.fields.length > 1 }
    end
  end
end
@@ -0,0 +1,53 @@
1
module Gimme
  class << self
    ##
    # Builds a hash describing the contact information on the current page.
    # Link entries hold whatever link_with_href returns (nil when no link
    # matches). The phone, email and contact form entries hold the strings
    # 'true' or 'false'.
    #
    # Add periods to link hrefs to prevent false positives. Must escape periods
    # with a backslash or else it will be a regex wild card.
    def scan_for_contacts
      {
        contactpage: link_with_href('contact'),
        email_present: email_available?.to_s,
        phone_present: phone_available?.to_s,
        contact_form: contactform_available?.to_s,
        facebook: link_with_href('facebook\.'),
        twitter: link_with_href('twitter\.'),
        youtube: link_with_href('youtube\.'),
        googleplus: link_with_href('plus\.google\.'),
        linkedin: link_with_href('linkedin\.')
      }
    end

    # Used in save_available_contacts to save each valid link.
    # Does nothing when either +key+ or +url+ is nil.
    def save_link(key, url)
      return if key.nil? || url.nil?

      # Create the hash on demand so saving works even if it was never reset.
      (@contact_links ||= {})[key] = url
    end

    ##
    # Remove negatives from the contacts hash.
    # Deletes a key value pair with a value of either nil or false.
    # Remember that false is a string.
    #
    # Modifies +hsh+ in place and returns it.
    def delete_failures(hsh)
      hsh.delete_if { |_k, v| v.nil? || v == 'false' }
    end

    ##
    # Saves any available contact info to @contact_links and records it in
    # a new Search::POC for +url+.
    #
    # Raises ArgumentError if +hsh+ is not a Hash. The type is checked first
    # so a bad argument is reported as such instead of failing inside the
    # helpers.
    #
    # Returns nil when there is nothing to save, otherwise the new
    # Search::POC.
    def save_available_contacts(url, hsh = scan_for_contacts)
      raise ArgumentError, "expected hash but got #{hsh.class}" unless hsh.is_a?(Hash)
      return unless something_to_save?(hsh)

      puts "\nsaving available contact information from #{url}"
      @contact_links ||= {}
      hsh.each { |key, value| save_link(key, value) }
      delete_failures(@contact_links)
      puts @contact_links.to_s.cyan
      Search::POC.new(url, @contact_links)
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Gimme
2
- VERSION = '0.0.3'
2
+ VERSION = '0.0.4'
3
3
  end
@@ -0,0 +1,79 @@
1
# Find the contact
module Gimme
  class << self
    ##
    # Go to a page using Mechanize.
    # Sleep for a split second to not overload any servers.
    #
    # On success the fetched page is stored in @page and returned.
    # Returns nil if the request fails with a response code error, a socket
    # error or a connection timeout; @page is left untouched in that case.
    def get(str)
      url = format_url(str)
      puts "sending GET request to: #{url}"
      sleep(0.1)
      @page = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }.get(url)
    rescue Mechanize::ResponseCodeError => e
      puts "#{'Response Error:'.red} #{e}"
      nil
    rescue SocketError => e
      puts "#{'Socket Error:'.red} #{e}"
      nil
    rescue Errno::ETIMEDOUT => e
      puts "#{'Connection Timeout:'.red} #{e}"
      nil
    end

    # Starts/Restarts the @contact_links hash. Returns the new empty hash.
    def start_contact_links
      puts 'setting contact links hash to {}'
      @contact_links = {}
    end

    ##
    # Mechanize needs absolute urls to work.
    # If http:// or https:// isn't present, prepend http://.
    # Delegates to LazyDomain.autohttp.
    def format_url(str)
      LazyDomain.autohttp(str)
    end

    ##
    # Outputs domain of a url. Useful if subdomains are given to GimmePOC
    # and they don't work.
    #
    # For example:
    # Given http://maps.google.com, returns 'google.com'.
    def orig_domain(str)
      LazyDomain.parse(str).domain
    end

    ##
    # Used in case of relative paths. Merging guarantees correct url.
    # This needs a url string as argument to work.
    # Produces a merged uri string, resolved against the current page's uri.
    def merged_link(url_str)
      page.uri.merge(url_str).to_s
    end

    ##
    # Finds the first link on the current page whose href matches +str+
    # and merges it against the page uri.
    # Returns a string. If there's nothing, return nil.
    #
    # Add \b word block to ensure whole word is searched.
    def link_with_href(str)
      link = page.link_with(href: /\b#{str}/)
      # No matching link is the ordinary case, not an error.
      return if link.nil?

      merged_link(link.uri.to_s)
    rescue StandardError
      # Best effort: a missing page or an unparsable href counts as "no link".
      nil
    end

    # Boolean, returns true if url is not identical to original domain.
    def subdomain?(str)
      str != orig_domain(str)
    end

    # TODO: Sometimes DNS will do a redirect and not give a 404.
    # Need to prevent redirects.
    #
    # Blindly tests to see if a url goes through. If there is a 404 error,
    # this will return nil.
    def blind_test(url)
      puts "\nblind testing: #{url}"
      get(url)
    end
  end
end
data/lib/gimme_poc.rb CHANGED
@@ -1,6 +1,12 @@
1
- require 'mechanize'
2
1
  require 'colored'
2
+ require 'lazy_domain'
3
+ require 'mechanize'
4
+ require_relative './gimme_poc/contactpage'
5
+ require_relative './gimme_poc/poc'
6
+ require_relative './gimme_poc/questions'
7
+ require_relative './gimme_poc/save'
3
8
  require_relative './gimme_poc/version'
9
+ require_relative './gimme_poc/web'
4
10
 
5
11
  # Find the contact
6
12
  module Gimme
@@ -8,51 +14,10 @@ module Gimme
8
14
  attr_accessor :page, :contact, :contact_links, :url
9
15
 
10
16
  # Simple regex that looks for ###.#### or ###-####
11
- PHONE_REGEX = /\d{3}[-]\d{4}|\d{3}[.]\d{4}/
17
# Matches ###-#### or ###.#### anywhere in a string. The pattern must not
# contain the old '/' delimiters, or it would require literal slashes.
PHONE_REGEX = /\d{3}[-.]\d{4}/
12
18
 
13
19
  # Captures http:// and https://
14
- HTTP_REGEX = /\A\bhttps:\/\/|\bhttp:\/\//
15
-
16
- ## ----------------------------------------------------------------
17
- # Questions
18
- #
19
- #
20
- #
21
-
22
- ##
23
- # Boolean, returns true if anything is present
24
- # after running scan_for_contacts.
25
- def something_to_save?
26
- scan_for_contacts.any?
27
- end
28
-
29
- # Boolean, returns true if email is present.
30
- def email_available?
31
- !link_with_href('mailto').nil?
32
- end
33
-
34
- # Boolean, returns true if phone number is present.
35
- def phone_available?
36
- !(page.body =~ PHONE_REGEX).nil?
37
- end
38
-
39
- ##
40
- # TODO: build better conditional to prevent false positives.
41
- # There could be other forms like newsletter signup, etc.
42
- #
43
- # If there is a form with more than one field, this returns true.
44
- # Forms with one field are typically search boxes.
45
- #
46
- # Boolean, returns true if form is present on page.
47
- def contactform_available?
48
- !(page.forms.select { |x| x.fields.length > 1 }.empty?)
49
- end
50
-
51
- ## ----------------------------------------------------------------
52
- # Actions
53
- #
54
- #
55
- #
20
# Matches a leading http:// or https:// scheme. Written without the old '/'
# delimiters inside %r{}, which made the pattern unable to match a plain
# url. Anchored at the start so a url embedded in a query string does not
# count as a scheme.
HTTP_REGEX = %r{\Ahttps?://}
56
21
 
57
22
  ##
58
23
  # The main method!
@@ -63,175 +28,21 @@ module Gimme
63
28
  arr.each do |url|
64
29
  puts '-' * 50
65
30
  puts "starting: #{url}"
66
- next if get(url).nil?
31
+ case
32
+ when subdomain?(url)
33
+ get(orig_domain(url)) if get(url).nil?
34
+ else
35
+ next if get(url).nil?
36
+ end
67
37
  puts 'now looking for contact pages'
68
38
  start_contact_links
69
- mechpage = go_to_contact_page
39
+ mechpage = go_to_contact_page(url)
70
40
  next if mechpage.nil?
71
41
  save_available_contacts(mechpage.uri.to_s)
72
42
  end
73
43
  Search.all_sites
74
44
  end
75
45
 
76
- # Mechanize needs absolute urls to work.
77
- # If http:// or https:// isn't present, append http://.
78
- def format_url(str)
79
- str.prepend('http://') if (str =~ HTTP_REGEX).nil?
80
- str
81
- end
82
-
83
- ##
84
- # Go to a page using Mechanize.
85
- # Sleep for a split second to not overload any servers.
86
- #
87
- # Returns nil if bad url is given.
88
- def get(str)
89
- url = format_url(str)
90
- puts "sending GET request to: #{url}"
91
- sleep(0.1)
92
- @page = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }.get(url)
93
- rescue SocketError => e
94
- puts "#{'skipping:'.red} -- #{e}"
95
- end
96
-
97
- # Starts/Restarts @contacts_links hash
98
- def start_contact_links
99
- puts 'setting contact links hash to {}'
100
- @contact_links = {}
101
- end
102
-
103
- ##
104
- # Scans for contact page. If it doesn't work on the first try,
105
- # It will look for english versions and try again. Processes left to right.
106
- #
107
- # Returns nil if no contact page can be found.
108
- def go_to_contact_page
109
- contact_page || english_contact_page
110
- end
111
-
112
- ##
113
- # Looks for contact page. Gets page if available.
114
- # If no contact link is available, it will blind test '../contact'.
115
- # Returns nil if nothing can be found.
116
- def contact_page
117
- contact_link = link_with_href(/contact|Contact/)
118
- contact_test_page = merged_link('../contact')
119
-
120
- case
121
- when !contact_link.nil?
122
- puts "#{'success:'.green} Found contact link!\n"
123
- get(merged_link(contact_link))
124
- else
125
- puts "#{'warning:'.yellow}couldn't find contact link"
126
- blind_test(contact_test_page)
127
- end
128
- end
129
-
130
- ##
131
- # Looks for english page. Gets page if available then looks for
132
- # english contact page.
133
- #
134
- # If no english link is available,
135
- # it will blind test '../en' and '../english'.
136
- # Returns nil if nothing can be found.
137
- def english_contact_page
138
- puts "\nLooking for english page..."
139
- english_link = page.link_with(href: /english|English/)
140
- test_en_page = merged_link('../en')
141
- test_english_page = merged_link('../english')
142
-
143
- case
144
- when !english_link.nil?
145
- puts "#{'success:'.green} found english link!"
146
- get(merged(english_link)) # already merged link
147
- else
148
- blind_test(test_en_page) || blind_test(test_english_page)
149
- puts 'ready to start again'
150
- contact_page
151
- end
152
- end
153
-
154
- # TODO: Sometimes DNS will do a redirect and not give a 404.
155
- # Need to prevent redirects.
156
- #
157
- # Blindly tests to see if a url goes through. If there is a 404 error,
158
- # this will return nil.
159
- def blind_test(url)
160
- puts "\nblind testing: #{url}"
161
- get(url)
162
- rescue Mechanize::ResponseCodeError
163
- puts "#{'404 Error:'.red} #{url}"
164
- end
165
-
166
- ##
167
- # Used in case of relative paths. Merging guarantees correct url.
168
- # This needs a url string as argument to work.
169
- # Produces a merged uri string.
170
- def merged_link(url_str)
171
- page.uri.merge(url_str).to_s
172
- end
173
-
174
- ##
175
- # Expects relative paths and merges everything.
176
- # Returns a string. If there's nothing, return nil.
177
- #
178
- # Add \b word block to ensure whole word is searched.
179
- def link_with_href(str)
180
- merged_link(page.link_with(href: /\b#{str}/).uri.to_s)
181
- rescue
182
- nil
183
- end
184
-
185
- ##
186
- # Returns anything that is possible to save, otherwise returns nil.
187
- # Booleans for phone, email, or contact form will display True or False.
188
- #
189
- # Add periods to link hrefs to prevent false positives. Must escape periods
190
- # with a backslash or else it will be a regex wild card.
191
- def scan_for_contacts
192
- {
193
- contactpage: link_with_href('contact'),
194
- email_present: "#{email_available?}",
195
- phone_present: "#{phone_available?}",
196
- contact_form: "#{contactform_available?}",
197
- facebook: link_with_href('facebook\.'),
198
- twitter: link_with_href('twitter\.'),
199
- youtube: link_with_href('youtube\.'),
200
- googleplus: link_with_href('plus\.google\.'),
201
- linkedin: link_with_href('linkedin\.')
202
- }
203
- end
204
-
205
- # Used in save_available_contacts to save each valid link.
206
- def save_link(key, url)
207
- return if key.nil? || url.nil?
208
- @contact_links[key] = url
209
- end
210
-
211
- ##
212
- # Remove negatives from the contacts hash.
213
- # Deletes a key value pair with a value of either nil or false.
214
- # Remember that false is a string.
215
- def delete_failures(hsh)
216
- hsh.delete_if { |_k, v| v.nil? || v == 'false' }
217
- end
218
-
219
- # Saves any available contact info to @contact_links.
220
- def save_available_contacts(url, hsh = scan_for_contacts)
221
- puts "\nsaving available contact information from #{url}"
222
- return unless something_to_save?
223
- if hsh.is_a?(Hash)
224
- hsh.each do |k, v|
225
- save_link(k, v) # saves to @contact_links
226
- end
227
- delete_failures(@contact_links)
228
- puts "#{@contact_links}".cyan # same as @contact_links
229
- else
230
- fail ArgumentError, "expected hash but got #{hsh.class}"
231
- end
232
- Search::POC.new(url, @contact_links)
233
- end
234
-
235
46
  # Convenience method.
236
47
  def memory
237
48
  Search.all_sites
@@ -243,21 +54,3 @@ module Gimme
243
54
  end
244
55
  end
245
56
  end
246
-
247
- # Collection of sites searched.
248
- class Search
249
- @all_sites = []
250
-
251
- class << self
252
- attr_accessor :all_sites
253
- end
254
-
255
- # Each site is saved to this class
256
- class POC
257
- def initialize(url, contact_info_hsh)
258
- @host = url
259
- @info = contact_info_hsh
260
- Search.all_sites << self
261
- end
262
- end
263
- end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gimme_poc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Mason
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-09 00:00:00.000000000 Z
11
+ date: 2015-10-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '1.2'
41
+ - !ruby/object:Gem::Dependency
42
+ name: lazy_domain
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.0.1
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.0.1
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rspec
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -91,7 +105,12 @@ files:
91
105
  - README.md
92
106
  - Rakefile
93
107
  - lib/gimme_poc.rb
108
+ - lib/gimme_poc/contactpage.rb
109
+ - lib/gimme_poc/poc.rb
110
+ - lib/gimme_poc/questions.rb
111
+ - lib/gimme_poc/save.rb
94
112
  - lib/gimme_poc/version.rb
113
+ - lib/gimme_poc/web.rb
95
114
  homepage: http://github.com/m8ss/gimme_poc
96
115
  licenses:
97
116
  - MIT