gimme_poc 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6984d721d974f11b5dab3136323f68d924867ad6
4
- data.tar.gz: 78503da039ce280bb1bb69898065b5d60a2d5463
3
+ metadata.gz: 6ca0656c586244edfaaac44d81b092956e1ed801
4
+ data.tar.gz: b2bbc85c51a79ba5a10a1a5e77d8084ac28adeee
5
5
  SHA512:
6
- metadata.gz: 52770e7b95a98abc5638e7b017674f5c58b61b22bbed847bb009cc5092aea3f533922f0da9bd0a571c5e40050f26779c5734bb21c308f065742c66994f0faf16
7
- data.tar.gz: b8405f565d973365c4bd05d6cab1f52295bd4b83c0fac858654e87d738cb3cf758096b6f5e2402716a2c768e11182479b71a6cad7c18869751a0fdadd9e6a7cf
6
+ metadata.gz: f2e131c8e68fb8a55169f62b1d9a5662901f6fcb45907878c30d8d700d784b66aa8cae0305c060a611da7d60b810cc2d56bb56959db1da378a3088fa6568d707
7
+ data.tar.gz: 6f32874d9fc287ce588baa0e91fa854ed7f75f045f88d798ebf02a0e781918f50d6922ffd06578965d7c033cacdb6a927d03efc7b7967c6a225d7579a8631853
@@ -0,0 +1,55 @@
1
+ # Find the contact
2
+ module Gimme
3
+ class << self
4
+ ##
5
+ # Scans for a contact page. If the first attempt returns nil,
6
+ # it looks for an english version and tries again (evaluated left to right).
7
+ #
8
+ # Returns nil if neither attempt produces a page.
9
+ def go_to_contact_page(url)
10
+ contact_page(url) || english_contact_page(url)
11
+ end
12
+
13
+ ##
14
+ # Looks for a contact link on the current page and gets it if present.
15
+ # Otherwise blind tests '../contact', then falls back to url's base domain.
16
+ # Returns nil when the request(s) attempted fail with a rescued error.
17
+ def contact_page(url)
18
+ contact_link = link_with_href(/contact|Contact/)
19
+ contact_test_page = merged_link('../contact')
20
+
21
+ case
22
+ when !contact_link.nil?
23
+ puts "#{'Success:'.green} Found contact link!\n"
24
+ get(merged_link(contact_link))
25
+ else
26
+ puts "#{'Warning:'.yellow} couldn't find contact link"
27
+ blind_test(contact_test_page) || get(orig_domain(url))
28
+ end
29
+ end
30
+
31
+ ##
32
+ # Looks for english page. Gets page if available then looks for
33
+ # english contact page.
34
+ #
35
+ # If no english link is available,
36
+ # it will blind test '../en' and '../english'.
37
+ # Returns nil if nothing can be found.
38
+ def english_contact_page(url)
39
+ puts "\nLooking for english page..."
40
+ english_link = page.link_with(href: /english|English/)
41
+ test_en_page = merged_link('../en')
42
+ test_english_page = merged_link('../english')
43
+
44
+ case
45
+ when !english_link.nil?
46
+ puts "#{'Success:'.green} found english link!"
47
+ get(merged(english_link)) # already merged link
48
+ else
49
+ blind_test(test_en_page) || blind_test(test_english_page)
50
+ puts 'ready to start again'
51
+ contact_page(url)
52
+ end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,21 @@
1
+ module Gimme
2
+ # Registry of searched sites; all_sites collects every POC created.
3
+ class Search
4
+ @all_sites = []
5
+
6
+ class << self
7
+ attr_accessor :all_sites
8
+ end
9
+
10
+ # One searched site: host is the url given, info is the contact hash.
11
+ class POC
12
+ attr_accessor :host, :info
13
+
14
+ def initialize(url, contact_info_hsh)
15
+ @host = url
16
+ @info = contact_info_hsh
17
+ Search.all_sites << self # every new POC registers itself
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,33 @@
1
+ # Find the contact
2
+ module Gimme
3
+ class << self
4
+ ##
5
+ # Boolean, returns true if hsh still has entries once delete_failures
6
+ # has removed its nil / 'false' values. Note: hsh is modified in place.
7
+ def something_to_save?(hsh)
8
+ delete_failures(hsh).any?
9
+ end
10
+
11
+ # Boolean, returns true if the page has a link whose href matches 'mailto'.
12
+ def email_available?
13
+ !link_with_href('mailto').nil?
14
+ end
15
+
16
+ # Boolean, returns true if PHONE_REGEX matches anywhere in the page body.
17
+ def phone_available?
18
+ !(page.body =~ PHONE_REGEX).nil?
19
+ end
20
+
21
+ ##
22
+ # TODO: build better conditional to prevent false positives.
23
+ # There could be other forms like newsletter signup, etc.
24
+ #
25
+ # If there is a form with more than one field, this returns true.
26
+ # Forms with one field are typically search boxes.
27
+ #
28
+ # Boolean, returns true if the page has a form with two or more fields.
29
+ def contactform_available?
30
+ !(page.forms.select { |x| x.fields.length > 1 }.empty?)
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,53 @@
1
+ module Gimme
2
+ class << self
3
+ ##
4
+ # Returns a hash of contact data for the current page. Link entries are a
5
+ # url string or nil; email/phone/form entries are the strings 'true'/'false'.
6
+ #
7
+ # Add periods to link hrefs to prevent false positives. Must escape periods
8
+ # with a backslash or else it will be a regex wild card.
9
+ def scan_for_contacts
10
+ {
11
+ contactpage: link_with_href('contact'),
12
+ email_present: "#{email_available?}",
13
+ phone_present: "#{phone_available?}",
14
+ contact_form: "#{contactform_available?}",
15
+ facebook: link_with_href('facebook\.'),
16
+ twitter: link_with_href('twitter\.'),
17
+ youtube: link_with_href('youtube\.'),
18
+ googleplus: link_with_href('plus\.google\.'),
19
+ linkedin: link_with_href('linkedin\.')
20
+ }
21
+ end
22
+
23
+ # Stores url under key in @contact_links; does nothing if either is nil.
24
+ def save_link(key, url)
25
+ return if key.nil? || url.nil?
26
+ @contact_links[key] = url
27
+ end
28
+
29
+ ##
30
+ # Removes negatives from the contacts hash in place and returns it.
31
+ # Deletes any key value pair whose value is nil or the string 'false'.
32
+ # A boolean false is not removed; scan_for_contacts stores strings.
33
+ def delete_failures(hsh)
34
+ hsh.delete_if { |_k, v| v.nil? || v == 'false' }
35
+ end
36
+
37
+ # Saves any available contact info to @contact_links.
38
+ def save_available_contacts(url, hsh = scan_for_contacts)
39
+ return unless something_to_save?(hsh)
40
+ puts "\nsaving available contact information from #{url}"
41
+ if hsh.is_a?(Hash)
42
+ hsh.each do |k, v|
43
+ save_link(k, v) # saves to @contact_links
44
+ end
45
+ delete_failures(@contact_links)
46
+ puts "#{@contact_links}".cyan # same as @contact_links
47
+ else
48
+ fail ArgumentError, "expected hash but got #{hsh.class}"
49
+ end
50
+ Search::POC.new(url, @contact_links)
51
+ end
52
+ end
53
+ end
@@ -1,3 +1,3 @@
1
1
  module Gimme
2
- VERSION = '0.0.3'
2
+ VERSION = '0.0.4'
3
3
  end
@@ -0,0 +1,79 @@
1
+ # Find the contact
2
+ module Gimme
3
+ class << self
4
+ ##
5
+ # Fetches a page with a new Mechanize agent and stores it in @page.
6
+ # Sleeps for a split second first to not overload any servers.
7
+ #
8
+ # Returns nil (after logging) when one of the rescued errors is raised.
9
+ def get(str)
10
+ url = format_url(str)
11
+ puts "sending GET request to: #{url}"
12
+ sleep(0.1)
13
+ @page = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }.get(url)
14
+ rescue Mechanize::ResponseCodeError => e
15
+ puts "#{'Response Error:'.red} #{e}"
16
+ rescue SocketError => e
17
+ puts "#{'Socket Error:'.red} #{e}"
18
+ rescue Errno::ETIMEDOUT => e
19
+ puts "#{'Connection Timeout:'.red} #{e}"
20
+ end
21
+
22
+ # Starts/restarts the @contact_links hash (reset to an empty hash).
23
+ def start_contact_links
24
+ puts 'setting contact links hash to {}'
25
+ @contact_links = {}
26
+ end
27
+
28
+ ##
29
+ # Mechanize needs absolute urls to work.
30
+ # Delegates to LazyDomain.autohttp (presumably prepends http:// - confirm).
31
+ def format_url(str)
32
+ LazyDomain.autohttp(str)
33
+ end
34
+
35
+ ##
36
+ # Outputs the domain of a url, as parsed by LazyDomain. Useful if a
37
+ # subdomain is given to GimmePOC and it doesn't work.
38
+ #
39
+ # For example:
40
+ # Given http://maps.google.com, returns 'google.com'.
41
+ def orig_domain(str)
42
+ LazyDomain.parse(str).domain
43
+ end
44
+
45
+ ##
46
+ # Resolves url_str against the current page's uri so relative paths work.
47
+ # This needs a url string as argument to work.
48
+ # Produces a merged uri string.
49
+ def merged_link(url_str)
50
+ page.uri.merge(url_str).to_s
51
+ end
52
+
53
+ ##
54
+ # Finds a page link whose href matches str and merges its uri.
55
+ # Returns a string. If nothing matches or any error is raised, returns nil.
56
+ #
57
+ # A leading \b word boundary keeps str from matching in the middle of a word.
58
+ def link_with_href(str)
59
+ merged_link(page.link_with(href: /\b#{str}/).uri.to_s)
60
+ rescue # bare rescue: also covers calling .uri on nil when no link matches
61
+ nil
62
+ end
63
+
64
+ # Boolean, true if str differs from orig_domain(str) (plain string compare).
65
+ def subdomain?(str)
66
+ (str != orig_domain(str))
67
+ end
68
+
69
+ # TODO: Sometimes DNS will do a redirect and not give a 404.
70
+ # Need to prevent redirects.
71
+ #
72
+ # Blindly tests to see if a url goes through. Returns whatever get returns,
73
+ # which is nil when get rescues an error such as a 404 response.
74
+ def blind_test(url)
75
+ puts "\nblind testing: #{url}"
76
+ get(url)
77
+ end
78
+ end
79
+ end
data/lib/gimme_poc.rb CHANGED
@@ -1,6 +1,12 @@
1
- require 'mechanize'
2
1
  require 'colored'
2
+ require 'lazy_domain'
3
+ require 'mechanize'
4
+ require_relative './gimme_poc/contactpage'
5
+ require_relative './gimme_poc/poc'
6
+ require_relative './gimme_poc/questions'
7
+ require_relative './gimme_poc/save'
3
8
  require_relative './gimme_poc/version'
9
+ require_relative './gimme_poc/web'
4
10
 
5
11
  # Find the contact
6
12
  module Gimme
@@ -8,51 +14,10 @@ module Gimme
8
14
  attr_accessor :page, :contact, :contact_links, :url
9
15
 
10
16
  # Simple regex that looks for ###.#### or ###-####
11
- PHONE_REGEX = /\d{3}[-]\d{4}|\d{3}[.]\d{4}/
17
+ PHONE_REGEX = /\d{3}[-.]\d{4}/ # no literal '/' in the pattern itself
12
18
 
13
19
  # Captures http:// and https://
14
- HTTP_REGEX = /\A\bhttps:\/\/|\bhttp:\/\//
15
-
16
- ## ----------------------------------------------------------------
17
- # Questions
18
- #
19
- #
20
- #
21
-
22
- ##
23
- # Boolean, returns true if anything is present
24
- # after running scan_for_contacts.
25
- def something_to_save?
26
- scan_for_contacts.any?
27
- end
28
-
29
- # Boolean, returns true if email is present.
30
- def email_available?
31
- !link_with_href('mailto').nil?
32
- end
33
-
34
- # Boolean, returns true if phone number is present.
35
- def phone_available?
36
- !(page.body =~ PHONE_REGEX).nil?
37
- end
38
-
39
- ##
40
- # TODO: build better conditional to prevent false positives.
41
- # There could be other forms like newsletter signup, etc.
42
- #
43
- # If there is a form with more than one field, this returns true.
44
- # Forms with one field are typically search boxes.
45
- #
46
- # Boolean, returns true if form is present on page.
47
- def contactform_available?
48
- !(page.forms.select { |x| x.fields.length > 1 }.empty?)
49
- end
50
-
51
- ## ----------------------------------------------------------------
52
- # Actions
53
- #
54
- #
55
- #
20
+ HTTP_REGEX = %r{\Ahttps?://} # anchored at start; %r{} needs no inner '/' delimiters
56
21
 
57
22
  ##
58
23
  # The main method!
@@ -63,175 +28,21 @@ module Gimme
63
28
  arr.each do |url|
64
29
  puts '-' * 50
65
30
  puts "starting: #{url}"
66
- next if get(url).nil?
31
+ case
32
+ when subdomain?(url)
33
+ get(orig_domain(url)) if get(url).nil?
34
+ else
35
+ next if get(url).nil?
36
+ end
67
37
  puts 'now looking for contact pages'
68
38
  start_contact_links
69
- mechpage = go_to_contact_page
39
+ mechpage = go_to_contact_page(url)
70
40
  next if mechpage.nil?
71
41
  save_available_contacts(mechpage.uri.to_s)
72
42
  end
73
43
  Search.all_sites
74
44
  end
75
45
 
76
- # Mechanize needs absolute urls to work.
77
- # If http:// or https:// isn't present, append http://.
78
- def format_url(str)
79
- str.prepend('http://') if (str =~ HTTP_REGEX).nil?
80
- str
81
- end
82
-
83
- ##
84
- # Go to a page using Mechanize.
85
- # Sleep for a split second to not overload any servers.
86
- #
87
- # Returns nil if bad url is given.
88
- def get(str)
89
- url = format_url(str)
90
- puts "sending GET request to: #{url}"
91
- sleep(0.1)
92
- @page = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }.get(url)
93
- rescue SocketError => e
94
- puts "#{'skipping:'.red} -- #{e}"
95
- end
96
-
97
- # Starts/Restarts @contacts_links hash
98
- def start_contact_links
99
- puts 'setting contact links hash to {}'
100
- @contact_links = {}
101
- end
102
-
103
- ##
104
- # Scans for contact page. If it doesn't work on the first try,
105
- # It will look for english versions and try again. Processes left to right.
106
- #
107
- # Returns nil if no contact page can be found.
108
- def go_to_contact_page
109
- contact_page || english_contact_page
110
- end
111
-
112
- ##
113
- # Looks for contact page. Gets page if available.
114
- # If no contact link is available, it will blind test '../contact'.
115
- # Returns nil if nothing can be found.
116
- def contact_page
117
- contact_link = link_with_href(/contact|Contact/)
118
- contact_test_page = merged_link('../contact')
119
-
120
- case
121
- when !contact_link.nil?
122
- puts "#{'success:'.green} Found contact link!\n"
123
- get(merged_link(contact_link))
124
- else
125
- puts "#{'warning:'.yellow}couldn't find contact link"
126
- blind_test(contact_test_page)
127
- end
128
- end
129
-
130
- ##
131
- # Looks for english page. Gets page if available then looks for
132
- # english contact page.
133
- #
134
- # If no english link is available,
135
- # it will blind test '../en' and '../english'.
136
- # Returns nil if nothing can be found.
137
- def english_contact_page
138
- puts "\nLooking for english page..."
139
- english_link = page.link_with(href: /english|English/)
140
- test_en_page = merged_link('../en')
141
- test_english_page = merged_link('../english')
142
-
143
- case
144
- when !english_link.nil?
145
- puts "#{'success:'.green} found english link!"
146
- get(merged(english_link)) # already merged link
147
- else
148
- blind_test(test_en_page) || blind_test(test_english_page)
149
- puts 'ready to start again'
150
- contact_page
151
- end
152
- end
153
-
154
- # TODO: Sometimes DNS will do a redirect and not give a 404.
155
- # Need to prevent redirects.
156
- #
157
- # Blindly tests to see if a url goes through. If there is a 404 error,
158
- # this will return nil.
159
- def blind_test(url)
160
- puts "\nblind testing: #{url}"
161
- get(url)
162
- rescue Mechanize::ResponseCodeError
163
- puts "#{'404 Error:'.red} #{url}"
164
- end
165
-
166
- ##
167
- # Used in case of relative paths. Merging guarantees correct url.
168
- # This needs a url string as argument to work.
169
- # Produces a merged uri string.
170
- def merged_link(url_str)
171
- page.uri.merge(url_str).to_s
172
- end
173
-
174
- ##
175
- # Expects relative paths and merges everything.
176
- # Returns a string. If there's nothing, return nil.
177
- #
178
- # Add \b word block to ensure whole word is searched.
179
- def link_with_href(str)
180
- merged_link(page.link_with(href: /\b#{str}/).uri.to_s)
181
- rescue
182
- nil
183
- end
184
-
185
- ##
186
- # Returns anything that is possible to save, otherwise returns nil.
187
- # Booleans for phone, email, or contact form will display True or False.
188
- #
189
- # Add periods to link hrefs to prevent false positives. Must escape periods
190
- # with a backslash or else it will be a regex wild card.
191
- def scan_for_contacts
192
- {
193
- contactpage: link_with_href('contact'),
194
- email_present: "#{email_available?}",
195
- phone_present: "#{phone_available?}",
196
- contact_form: "#{contactform_available?}",
197
- facebook: link_with_href('facebook\.'),
198
- twitter: link_with_href('twitter\.'),
199
- youtube: link_with_href('youtube\.'),
200
- googleplus: link_with_href('plus\.google\.'),
201
- linkedin: link_with_href('linkedin\.')
202
- }
203
- end
204
-
205
- # Used in save_available_contacts to save each valid link.
206
- def save_link(key, url)
207
- return if key.nil? || url.nil?
208
- @contact_links[key] = url
209
- end
210
-
211
- ##
212
- # Remove negatives from the contacts hash.
213
- # Deletes a key value pair with a value of either nil or false.
214
- # Remember that false is a string.
215
- def delete_failures(hsh)
216
- hsh.delete_if { |_k, v| v.nil? || v == 'false' }
217
- end
218
-
219
- # Saves any available contact info to @contact_links.
220
- def save_available_contacts(url, hsh = scan_for_contacts)
221
- puts "\nsaving available contact information from #{url}"
222
- return unless something_to_save?
223
- if hsh.is_a?(Hash)
224
- hsh.each do |k, v|
225
- save_link(k, v) # saves to @contact_links
226
- end
227
- delete_failures(@contact_links)
228
- puts "#{@contact_links}".cyan # same as @contact_links
229
- else
230
- fail ArgumentError, "expected hash but got #{hsh.class}"
231
- end
232
- Search::POC.new(url, @contact_links)
233
- end
234
-
235
46
  # Convenience method.
236
47
  def memory
237
48
  Search.all_sites
@@ -243,21 +54,3 @@ module Gimme
243
54
  end
244
55
  end
245
56
  end
246
-
247
- # Collection of sites searched.
248
- class Search
249
- @all_sites = []
250
-
251
- class << self
252
- attr_accessor :all_sites
253
- end
254
-
255
- # Each site is saved to this class
256
- class POC
257
- def initialize(url, contact_info_hsh)
258
- @host = url
259
- @info = contact_info_hsh
260
- Search.all_sites << self
261
- end
262
- end
263
- end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gimme_poc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Mason
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-10-09 00:00:00.000000000 Z
11
+ date: 2015-10-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '1.2'
41
+ - !ruby/object:Gem::Dependency
42
+ name: lazy_domain
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.0.1
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.0.1
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rspec
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -91,7 +105,12 @@ files:
91
105
  - README.md
92
106
  - Rakefile
93
107
  - lib/gimme_poc.rb
108
+ - lib/gimme_poc/contactpage.rb
109
+ - lib/gimme_poc/poc.rb
110
+ - lib/gimme_poc/questions.rb
111
+ - lib/gimme_poc/save.rb
94
112
  - lib/gimme_poc/version.rb
113
+ - lib/gimme_poc/web.rb
95
114
  homepage: http://github.com/m8ss/gimme_poc
96
115
  licenses:
97
116
  - MIT