gimme_poc 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0e6506c640bcafe56d6906bfaae60f8a17e72865
4
- data.tar.gz: 21e4fc7176c2d5aa28a0fe31ab993889a7fbf74d
3
+ metadata.gz: c2765f0ef20a443c6c37e7ffc7f2cfa83454ec6b
4
+ data.tar.gz: 9761000cf90a2aba6549dfa774a5aff7f6daebff
5
5
  SHA512:
6
- metadata.gz: 39c35961983b31246c7414067d607bbc6c7a6dbddc5af9dfa97122ff094b881248db7a4a9c6415c33e393b3f59e5c632f0b06c3fe04e70dd9035892c5c2b11cf
7
- data.tar.gz: b1a194c4f0d51d7b3ea77f81786cc5b2d981b7c27f22dbeb6db98f8c1cca688437d04a1af5c9193d25e92072f88c664468d59ff89a2a853695e59809792ee457
6
+ metadata.gz: 48f94a4ecd20ab3db3197abcedecabeb8fb8a0fe7b4274fbff3d23bea395363bae69fe7bd3f42b56a5e701599e4780c367cae513b2ad95f9622167d99844011f
7
+ data.tar.gz: 132fad4c4fdc308a0cc3b630314525d17e197e0dcc568f5690ffdcfd351e434b8a9b2d5d1a0e9be613077b155c7cd51ed2f17c7a3ad1e2919121c88a0126dc78
data/README.md CHANGED
@@ -1,4 +1,5 @@
1
1
  # Gimme POC
2
+ [![Build Status](https://travis-ci.org/m8ss/gimme_poc.svg?branch=master)](https://travis-ci.org/m8ss/gimme_poc) [![Code Climate](https://codeclimate.com/github/m8ss/gimme_poc/badges/gpa.svg)](https://codeclimate.com/github/m8ss/gimme_poc) [![Gem Version](https://badge.fury.io/rb/gimme_poc.svg)](https://badge.fury.io/rb/gimme_poc)
2
3
 
3
4
  Gimme POC (Point of Contact) simplifies the process of extracting the common 'contact us' information from a website.
4
5
 
@@ -13,6 +14,13 @@ gem install gimme_poc
13
14
 
14
15
  ```
15
16
 
17
+ ## Set Up
18
+
19
+ ```ruby
20
+ require 'gimme_poc' # => that's it!
21
+
22
+ ```
23
+
16
24
  ## How it works
17
25
 
18
26
  Gimme POC is easy to use! Simply run this command.
@@ -60,7 +68,7 @@ Gimme.memory
60
68
 
61
69
  ## Clearing the search results
62
70
 
63
- To clear search results and start afresh, simply run:
71
+ To clear search results and start afresh, run:
64
72
 
65
73
  ```ruby
66
74
 
@@ -71,7 +79,7 @@ Gimme.reset!
71
79
  ## To do:
72
80
 
73
81
  - Convenience methods for returning specific information from all sites, (ie. just facebook or just twitter)
74
- - Work on false positives of bad urls. (DNS redirects don't give 404 errors)
82
+ - Work on false positives of bad urls. (Bad urls should be skipped + DNS redirects don't give 404 errors)
75
83
 
76
84
 
77
85
  More to follow...
data/Rakefile CHANGED
@@ -1,22 +1,6 @@
1
1
  require 'rubygems'
2
- require 'bundler'
3
-
4
- begin
5
- Bundler.setup(:default, :development)
6
- rescue Bundler::BundlerError => e
7
- $stderr.puts e.message
8
- $stderr.puts 'Run `bundle install` to install missing gems'
9
- exit e.status_code
10
- end
11
2
  require 'rake'
12
3
 
13
- require 'rake/testtask'
14
- Rake::TestTask.new(:test) do |test|
15
- test.libs << 'lib' << 'test'
16
- test.pattern = 'test/**/test_gimme_poc*.rb'
17
- test.verbose = true
18
- end
19
-
20
4
  desc 'Open console with gimme_poc loaded'
21
5
  task :console do
22
6
  exec 'pry -r ./lib/gimme_poc.rb'
@@ -1,3 +1,3 @@
1
1
  module Gimme
2
- VERSION = '0.0.1'
2
+ VERSION = '0.0.2'
3
3
  end
data/lib/gimme_poc.rb CHANGED
@@ -9,6 +9,9 @@ module Gimme
9
9
 
10
10
  # Simple regex that looks for ###.#### or ###-####
11
11
  PHONE_REGEX = /\d{3}[-]\d{4}|\d{3}[.]\d{4}/
12
+
13
+ # Captures http:// and https://
14
+ HTTP_REGEX = /\A\bhttps:\/\/|\bhttp:\/\//
12
15
 
13
16
  ## ----------------------------------------------------------------
14
17
  # Questions
@@ -16,51 +19,7 @@ module Gimme
16
19
  #
17
20
  #
18
21
 
19
- # Boolean, returns true if contact link is present.
20
- def contact_link?
21
- @url = link_with_href('contact')
22
-
23
- !@url.nil?
24
- end
25
-
26
- # True if contact page '../contact' does NOT get a 404 error.
27
- def contact_page?
28
- @url = page.uri.merge('../contact').to_s
29
- begin
30
- true if Mechanize.new.get(@url)
31
- rescue Mechanize::ResponseCodeError
32
- false
33
- end
34
- end
35
-
36
- # Boolean, returns true if link to English version is present.
37
- def english_link?
38
- return false if page.link_with(href: /english/).nil?
39
- @url = page.uri.merge(page.link_with(href: /english/).uri.to_s).to_s
40
-
41
- !@url.nil?
42
- end
43
-
44
- # True if english page '../en' does NOT get a 404 error.
45
- def en_page?
46
- @url = page.uri.merge('../en').to_s
47
- begin
48
- true if Mechanize.new.get(@url)
49
- rescue Mechanize::ResponseCodeError
50
- false
51
- end
52
- end
53
-
54
- # True if english page '../english' does NOT get a 404 error.
55
- def english_page?
56
- @url = page.uri.merge('../english').to_s
57
- begin
58
- true if Mechanize.new.get(@url)
59
- rescue Mechanize::ResponseCodeError
60
- false
61
- end
62
- end
63
-
22
+ ##
64
23
  # Boolean, returns true if anything is present
65
24
  # after running scan_for_contacts.
66
25
  def something_to_save?
@@ -77,8 +36,9 @@ module Gimme
77
36
  !(page.body =~ PHONE_REGEX).nil?
78
37
  end
79
38
 
39
+ ##
80
40
  # TODO: build better conditional to prevent false positives.
81
- # There could be other forms like newsletter signup, etc.
41
+ # There could be other forms like newsletter signup, etc.
82
42
  #
83
43
  # If there is a form with more than one field, this returns true.
84
44
  # Forms with one field are typically search boxes.
@@ -94,25 +54,43 @@ module Gimme
94
54
  #
95
55
  #
96
56
 
57
+ ##
97
58
  # The main method!
98
59
  # Takes array of urls and gets contact info for each if possible.
60
+ # If a url cannot be fetched, 'get' returns nil and that url is skipped.
99
61
  def poc(arr)
100
62
  arr = arr.split unless arr.is_a?(Array)
101
63
  arr.each do |url|
102
64
  puts '-' * 50
103
65
  puts "starting: #{url}"
104
- get(url)
66
+ next if get(url).nil?
67
+ puts "now looking for contact pages"
105
68
  start_contact_links
106
- go_to_contact_page
107
- save_available_contacts
69
+ mechpage = go_to_contact_page
70
+ save_available_contacts(mechpage.uri.to_s)
108
71
  end
109
72
  Search.all_sites
110
73
  end
74
+
75
+ # Mechanize needs absolute urls to work.
76
+ # If http:// or https:// isn't present, prepend http://.
77
+ def format_url(str)
78
+ str.prepend('http://') if (str =~ HTTP_REGEX).nil?
79
+ str
80
+ end
111
81
 
82
+ ##
112
83
  # Go to a page using Mechanize.
113
- def get(url)
114
- puts "sending GET request: #{url}..."
84
+ # Sleep for a split second to not overload any servers.
85
+ #
86
+ # Returns nil if bad url is given.
87
+ def get(str)
88
+ url = format_url(str)
89
+ puts "sending GET request to: #{url}"
90
+ sleep(0.1)
115
91
  @page = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }.get(url)
92
+ rescue SocketError => e
93
+ puts "#{'skipping:'.red} -- #{e}"
116
94
  end
117
95
 
118
96
  # Starts/Restarts the @contact_links hash.
@@ -121,55 +99,107 @@ module Gimme
121
99
  @contact_links = {}
122
100
  end
123
101
 
102
+ ##
124
103
  # Scans for contact page. If it doesn't work on the first try,
125
- # It will look for english versions and try again.
104
+ # it will look for English versions and try again. Processes left to right.
126
105
  #
127
- # If contact page is found, go directly there and don't try again.
106
+ # Returns nil if no contact page can be found.
128
107
  def go_to_contact_page
129
- 1.times do
130
- case
131
- when contact_link?
132
- puts 'found contact link!'.green
133
- get(@url)
134
- when contact_page?
135
- puts 'found contact page!'.green
136
- get(@url)
137
- else
138
- if english_link? # look for link first, not check the page.
139
- puts 'found english link!'
140
- get(@url)
141
- redo
142
- elsif en_page?
143
- puts 'found en page!'
144
- get(@url)
145
- redo
146
- elsif english_page?
147
- puts 'found english page!'
148
- redo
149
- end
150
- end
108
+ contact_page || english_contact_page
109
+ end
110
+
111
+ ##
112
+ # Looks for contact page. Gets page if available.
113
+ # If no contact link is available, it will blind test '../contact'.
114
+ # Returns nil if nothing can be found.
115
+ def contact_page
116
+ contact_link = link_with_href(/contact|Contact/)
117
+ contact_test_page = merged_link('../contact')
118
+
119
+ case
120
+ when !contact_link.nil?
121
+ puts "#{'success:'.green} Found contact link!\n"
122
+ get(merged_link(contact_link))
123
+ else
124
+ puts "#{'warning:'.yellow}couldn't find contact link"
125
+ blind_test(contact_test_page)
126
+ end
127
+ end
128
+
129
+ ##
130
+ # Looks for english page. Gets page if available then looks for
131
+ # english contact page.
132
+ #
133
+ # If no english link is available,
134
+ # it will blind test '../en' and '../english'.
135
+ # Returns nil if nothing can be found.
136
+ def english_contact_page
137
+ puts "\nLooking for english page..."
138
+ english_link = page.link_with(href: /english|English/)
139
+ test_en_page = merged_link('../en')
140
+ test_english_page = merged_link('../english')
141
+
142
+ case
143
+ when !english_link.nil?
144
+ puts "#{'success:'.green} found english link!"
145
+ get(merged(english_link)) # already merged link
146
+ else
147
+ blind_test(test_en_page) || blind_test(test_english_page)
148
+ puts "ready to start again"
149
+ contact_page
151
150
  end
152
151
  end
153
152
 
153
+ # TODO: Sometimes DNS will do a redirect and not give a 404.
154
+ # Need to prevent redirects.
155
+ #
156
+ # Blindly tests to see if a url goes through. If there is a 404 error,
157
+ # this will return nil.
158
+ def blind_test(url)
159
+ begin
160
+ puts "\nblind testing: #{url}"
161
+ get(url)
162
+ rescue Mechanize::ResponseCodeError
163
+ puts "#{'404 Error:'.red} #{url}"
164
+ end
165
+ end
166
+
167
+ ##
168
+ # Used in case of relative paths. Merging guarantees correct url.
169
+ # This needs a url string as argument to work.
170
+ # Produces a merged uri string.
171
+ def merged_link(url_str)
172
+ page.uri.merge(url_str).to_s
173
+ end
174
+
175
+ ##
154
176
  # Expects relative paths and merges everything.
155
177
  # Returns a string. If there's nothing, return nil.
178
+ #
179
+ # Prefixes the pattern with \b (word boundary) so a match must start at the beginning of a word.
156
180
  def link_with_href(str)
157
- page.uri.merge(page.link_with(href: /#{str}/).uri.to_s).to_s rescue nil
181
+ merged_link(page.link_with(href: /\b#{str}/).uri.to_s)
182
+ rescue
183
+ nil
158
184
  end
159
185
 
186
+ ##
160
187
  # Returns anything that is possible to save, otherwise returns nil.
161
188
  # The phone, email, and contact form flags are stored as the strings 'true' or 'false'.
189
+ #
190
+ # Add periods to link hrefs to prevent false positives. Must escape periods
191
+ # with a backslash or else it will be a regex wild card.
162
192
  def scan_for_contacts
163
193
  {
164
194
  contactpage: link_with_href('contact'),
165
195
  email_present: "#{email_available?}",
166
196
  phone_present: "#{phone_available?}",
167
197
  contact_form: "#{contactform_available?}",
168
- facebook: link_with_href('facebook'),
169
- twitter: link_with_href('twitter'),
170
- youtube: link_with_href('youtube'),
171
- googleplus: link_with_href('plus.google'),
172
- linkedin: link_with_href('linkedin')
198
+ facebook: link_with_href('facebook\.'),
199
+ twitter: link_with_href('twitter\.'),
200
+ youtube: link_with_href('youtube\.'),
201
+ googleplus: link_with_href('plus\.google\.'),
202
+ linkedin: link_with_href('linkedin\.')
173
203
  }
174
204
  end
175
205
 
@@ -179,6 +209,7 @@ module Gimme
179
209
  @contact_links[key] = url
180
210
  end
181
211
 
212
+ ##
182
213
  # Remove negatives from the contacts hash.
183
214
  # Deletes a key value pair with a value of either nil or false.
184
215
  # Remember that false is a string.
@@ -187,8 +218,8 @@ module Gimme
187
218
  end
188
219
 
189
220
  # Saves any available contact info to @contact_links.
190
- def save_available_contacts(hsh = scan_for_contacts)
191
- puts 'saving available contact information...'
221
+ def save_available_contacts(url, hsh = scan_for_contacts)
222
+ puts "\nsaving available contact information from #{url}"
192
223
  return unless something_to_save?
193
224
  if hsh.is_a?(Hash)
194
225
  hsh.each do |k, v|
@@ -199,10 +230,10 @@ module Gimme
199
230
  else
200
231
  fail ArgumentError, "expected hash but got #{hsh.class}"
201
232
  end
202
- Search::POC.new(@contact_links)
233
+ Search::POC.new(url, @contact_links)
203
234
  end
204
235
 
205
- # Convenience method
236
+ # Convenience method.
206
237
  def memory
207
238
  Search.all_sites
208
239
  end
@@ -214,7 +245,7 @@ module Gimme
214
245
  end
215
246
  end
216
247
 
217
- # Collectino of sites searched
248
+ # Collection of sites searched.
218
249
  class Search
219
250
  @all_sites = []
220
251
 
@@ -224,8 +255,9 @@ class Search
224
255
 
225
256
  # Each site is saved to this class
226
257
  class POC
227
- def initialize(contact_info_hsh)
228
- @contact_info = contact_info_hsh
258
+ def initialize(url, contact_info_hsh)
259
+ @host = url
260
+ @info = contact_info_hsh
229
261
  Search.all_sites << self
230
262
  end
231
263
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gimme_poc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Mason
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '1.2'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.3'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.3'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: pry
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -98,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
98
112
  version: '0'
99
113
  requirements: []
100
114
  rubyforge_project:
101
- rubygems_version: 2.4.5
115
+ rubygems_version: 2.4.3
102
116
  signing_key:
103
117
  specification_version: 4
104
118
  summary: Get a point of contact. Given a url or array of urls, extracts social media