gimme_poc 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0e6506c640bcafe56d6906bfaae60f8a17e72865
4
- data.tar.gz: 21e4fc7176c2d5aa28a0fe31ab993889a7fbf74d
3
+ metadata.gz: c2765f0ef20a443c6c37e7ffc7f2cfa83454ec6b
4
+ data.tar.gz: 9761000cf90a2aba6549dfa774a5aff7f6daebff
5
5
  SHA512:
6
- metadata.gz: 39c35961983b31246c7414067d607bbc6c7a6dbddc5af9dfa97122ff094b881248db7a4a9c6415c33e393b3f59e5c632f0b06c3fe04e70dd9035892c5c2b11cf
7
- data.tar.gz: b1a194c4f0d51d7b3ea77f81786cc5b2d981b7c27f22dbeb6db98f8c1cca688437d04a1af5c9193d25e92072f88c664468d59ff89a2a853695e59809792ee457
6
+ metadata.gz: 48f94a4ecd20ab3db3197abcedecabeb8fb8a0fe7b4274fbff3d23bea395363bae69fe7bd3f42b56a5e701599e4780c367cae513b2ad95f9622167d99844011f
7
+ data.tar.gz: 132fad4c4fdc308a0cc3b630314525d17e197e0dcc568f5690ffdcfd351e434b8a9b2d5d1a0e9be613077b155c7cd51ed2f17c7a3ad1e2919121c88a0126dc78
data/README.md CHANGED
@@ -1,4 +1,5 @@
1
1
  # Gimme POC
2
+ [![Build Status](https://travis-ci.org/m8ss/gimme_poc.svg?branch=master)](https://travis-ci.org/m8ss/gimme_poc) [![Code Climate](https://codeclimate.com/github/m8ss/gimme_poc/badges/gpa.svg)](https://codeclimate.com/github/m8ss/gimme_poc) [![Gem Version](https://badge.fury.io/rb/gimme_poc.svg)](https://badge.fury.io/rb/gimme_poc)
2
3
 
3
4
  Gimme POC (Point of Contact) simplifies the process of extracting the common 'contact us' information from a website.
4
5
 
@@ -13,6 +14,13 @@ gem install gimme_poc
13
14
 
14
15
  ```
15
16
 
17
+ ## Set Up
18
+
19
+ ```ruby
20
+ require 'gimme_poc' # => that's it!
21
+
22
+ ```
23
+
16
24
  ## How it works
17
25
 
18
26
  Gimme POC is easy to use! Simply run this command.
@@ -60,7 +68,7 @@ Gimme.memory
60
68
 
61
69
  ## Clearing the search results
62
70
 
63
- To clear search results and start afresh, simply run:
71
+ To clear search results and start afresh, run:
64
72
 
65
73
  ```ruby
66
74
 
@@ -71,7 +79,7 @@ Gimme.reset!
71
79
  ## To do:
72
80
 
73
81
  - Convenience methods for returning specific information from all sites, (ie. just facebook or just twitter)
74
- - Work on false positives of bad urls. (DNS redirects don't give 404 errors)
82
+ - Work on false positives of bad urls. (Bad urls should be skipped, and DNS redirects don't give 404 errors.)
75
83
 
76
84
 
77
85
  More to follow...
data/Rakefile CHANGED
@@ -1,22 +1,6 @@
1
1
  require 'rubygems'
2
- require 'bundler'
3
-
4
- begin
5
- Bundler.setup(:default, :development)
6
- rescue Bundler::BundlerError => e
7
- $stderr.puts e.message
8
- $stderr.puts 'Run `bundle install` to install missing gems'
9
- exit e.status_code
10
- end
11
2
  require 'rake'
12
3
 
13
- require 'rake/testtask'
14
- Rake::TestTask.new(:test) do |test|
15
- test.libs << 'lib' << 'test'
16
- test.pattern = 'test/**/test_gimme_poc*.rb'
17
- test.verbose = true
18
- end
19
-
20
4
  desc 'Open console with gimme_poc loaded'
21
5
  task :console do
22
6
  exec 'pry -r ./lib/gimme_poc.rb'
@@ -1,3 +1,3 @@
1
1
  module Gimme
2
- VERSION = '0.0.1'
2
+ VERSION = '0.0.2'
3
3
  end
data/lib/gimme_poc.rb CHANGED
@@ -9,6 +9,9 @@ module Gimme
9
9
 
10
10
  # Simple regex that looks for ###.#### or ###-####
11
11
  PHONE_REGEX = /\d{3}[-]\d{4}|\d{3}[.]\d{4}/
12
+
13
+ # Captures http:// and https://
14
+ HTTP_REGEX = /\A\bhttps:\/\/|\bhttp:\/\//
12
15
 
13
16
  ## ----------------------------------------------------------------
14
17
  # Questions
@@ -16,51 +19,7 @@ module Gimme
16
19
  #
17
20
  #
18
21
 
19
- # Boolean, returns true if contact link is present.
20
- def contact_link?
21
- @url = link_with_href('contact')
22
-
23
- !@url.nil?
24
- end
25
-
26
- # True if contact page '../contact' does NOT get a 404 error.
27
- def contact_page?
28
- @url = page.uri.merge('../contact').to_s
29
- begin
30
- true if Mechanize.new.get(@url)
31
- rescue Mechanize::ResponseCodeError
32
- false
33
- end
34
- end
35
-
36
- # Boolean, returns true if link to English version is present.
37
- def english_link?
38
- return false if page.link_with(href: /english/).nil?
39
- @url = page.uri.merge(page.link_with(href: /english/).uri.to_s).to_s
40
-
41
- !@url.nil?
42
- end
43
-
44
- # True if english page '../en' does NOT get a 404 error.
45
- def en_page?
46
- @url = page.uri.merge('../en').to_s
47
- begin
48
- true if Mechanize.new.get(@url)
49
- rescue Mechanize::ResponseCodeError
50
- false
51
- end
52
- end
53
-
54
- # True if english page '../english' does NOT get a 404 error.
55
- def english_page?
56
- @url = page.uri.merge('../english').to_s
57
- begin
58
- true if Mechanize.new.get(@url)
59
- rescue Mechanize::ResponseCodeError
60
- false
61
- end
62
- end
63
-
22
+ ##
64
23
  # Boolean, returns true if anything is present
65
24
  # after running scan_for_contacts.
66
25
  def something_to_save?
@@ -77,8 +36,9 @@ module Gimme
77
36
  !(page.body =~ PHONE_REGEX).nil?
78
37
  end
79
38
 
39
+ ##
80
40
  # TODO: build better conditional to prevent false positives.
81
- # There could be other forms like newsletter signup, etc.
41
+ # There could be other forms like newsletter signup, etc.
82
42
  #
83
43
  # If there is a form with more than one field, this returns true.
84
44
  # Forms with one field are typically search boxes.
@@ -94,25 +54,43 @@ module Gimme
94
54
  #
95
55
  #
96
56
 
57
+ ##
97
58
  # The main method!
98
59
  # Takes array of urls and gets contact info for each if possible.
60
+ # If a url is bad, the 'get' method returns nil and that url is skipped.
99
61
  def poc(arr)
100
62
  arr = arr.split unless arr.is_a?(Array)
101
63
  arr.each do |url|
102
64
  puts '-' * 50
103
65
  puts "starting: #{url}"
104
- get(url)
66
+ next if get(url).nil?
67
+ puts "now looking for contact pages"
105
68
  start_contact_links
106
- go_to_contact_page
107
- save_available_contacts
69
+ mechpage = go_to_contact_page
70
+ save_available_contacts(mechpage.uri.to_s)
108
71
  end
109
72
  Search.all_sites
110
73
  end
74
+
75
+ # Mechanize needs absolute urls to work.
76
+ # If http:// or https:// isn't present, prepend http://.
77
+ def format_url(str)
78
+ str.prepend('http://') if (str =~ HTTP_REGEX).nil?
79
+ str
80
+ end
111
81
 
82
+ ##
112
83
  # Go to a page using Mechanize.
113
- def get(url)
114
- puts "sending GET request: #{url}..."
84
+ # Sleep for a split second to not overload any servers.
85
+ #
86
+ # Returns nil if bad url is given.
87
+ def get(str)
88
+ url = format_url(str)
89
+ puts "sending GET request to: #{url}"
90
+ sleep(0.1)
115
91
  @page = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }.get(url)
92
+ rescue SocketError => e
93
+ puts "#{'skipping:'.red} -- #{e}"
116
94
  end
117
95
 
118
96
  # Starts/Restarts @contacts_links hash
@@ -121,55 +99,107 @@ module Gimme
121
99
  @contact_links = {}
122
100
  end
123
101
 
102
+ ##
124
103
  # Scans for contact page. If it doesn't work on the first try,
125
- # It will look for english versions and try again.
104
+ # it will look for English versions and try again. Candidates are tried left to right.
126
105
  #
127
- # If contact page is found, go directly there and don't try again.
106
+ # Returns nil if no contact page can be found.
128
107
  def go_to_contact_page
129
- 1.times do
130
- case
131
- when contact_link?
132
- puts 'found contact link!'.green
133
- get(@url)
134
- when contact_page?
135
- puts 'found contact page!'.green
136
- get(@url)
137
- else
138
- if english_link? # look for link first, not check the page.
139
- puts 'found english link!'
140
- get(@url)
141
- redo
142
- elsif en_page?
143
- puts 'found en page!'
144
- get(@url)
145
- redo
146
- elsif english_page?
147
- puts 'found english page!'
148
- redo
149
- end
150
- end
108
+ contact_page || english_contact_page
109
+ end
110
+
111
+ ##
112
+ # Looks for contact page. Gets page if available.
113
+ # If no contact link is available, it will blind test '../contact'.
114
+ # Returns nil if nothing can be found.
115
+ def contact_page
116
+ contact_link = link_with_href(/contact|Contact/)
117
+ contact_test_page = merged_link('../contact')
118
+
119
+ case
120
+ when !contact_link.nil?
121
+ puts "#{'success:'.green} Found contact link!\n"
122
+ get(merged_link(contact_link))
123
+ else
124
+ puts "#{'warning:'.yellow}couldn't find contact link"
125
+ blind_test(contact_test_page)
126
+ end
127
+ end
128
+
129
+ ##
130
+ # Looks for english page. Gets page if available then looks for
131
+ # english contact page.
132
+ #
133
+ # If no english link is available,
134
+ # it will blind test '../en' and '../english'.
135
+ # Returns nil if nothing can be found.
136
+ def english_contact_page
137
+ puts "\nLooking for english page..."
138
+ english_link = page.link_with(href: /english|English/)
139
+ test_en_page = merged_link('../en')
140
+ test_english_page = merged_link('../english')
141
+
142
+ case
143
+ when !english_link.nil?
144
+ puts "#{'success:'.green} found english link!"
145
+ get(merged(english_link)) # already merged link
146
+ else
147
+ blind_test(test_en_page) || blind_test(test_english_page)
148
+ puts "ready to start again"
149
+ contact_page
151
150
  end
152
151
  end
153
152
 
153
+ # TODO: Sometimes DNS will do a redirect and not give a 404.
154
+ # Need to prevent redirects.
155
+ #
156
+ # Blindly tests to see if a url goes through. If there is a 404 error,
157
+ # this will return nil.
158
+ def blind_test(url)
159
+ begin
160
+ puts "\nblind testing: #{url}"
161
+ get(url)
162
+ rescue Mechanize::ResponseCodeError
163
+ puts "#{'404 Error:'.red} #{url}"
164
+ end
165
+ end
166
+
167
+ ##
168
+ # Used in case of relative paths. Merging guarantees correct url.
169
+ # This needs a url string as argument to work.
170
+ # Produces a merged uri string.
171
+ def merged_link(url_str)
172
+ page.uri.merge(url_str).to_s
173
+ end
174
+
175
+ ##
154
176
  # Expects relative paths and merges everything.
155
177
  # Returns a string. If there's nothing, return nil.
178
+ #
179
+ # Adds a leading \b word boundary so the match must start at the beginning of a word.
156
180
  def link_with_href(str)
157
- page.uri.merge(page.link_with(href: /#{str}/).uri.to_s).to_s rescue nil
181
+ merged_link(page.link_with(href: /\b#{str}/).uri.to_s)
182
+ rescue
183
+ nil
158
184
  end
159
185
 
186
+ ##
160
187
  # Returns anything that is possible to save, otherwise returns nil.
161
188
  # Booleans for phone, email, or contact form will display True or False.
189
+ #
190
+ # Add periods to link hrefs to prevent false positives. Must escape periods
191
+ # with a backslash or else it will be a regex wild card.
162
192
  def scan_for_contacts
163
193
  {
164
194
  contactpage: link_with_href('contact'),
165
195
  email_present: "#{email_available?}",
166
196
  phone_present: "#{phone_available?}",
167
197
  contact_form: "#{contactform_available?}",
168
- facebook: link_with_href('facebook'),
169
- twitter: link_with_href('twitter'),
170
- youtube: link_with_href('youtube'),
171
- googleplus: link_with_href('plus.google'),
172
- linkedin: link_with_href('linkedin')
198
+ facebook: link_with_href('facebook\.'),
199
+ twitter: link_with_href('twitter\.'),
200
+ youtube: link_with_href('youtube\.'),
201
+ googleplus: link_with_href('plus\.google\.'),
202
+ linkedin: link_with_href('linkedin\.')
173
203
  }
174
204
  end
175
205
 
@@ -179,6 +209,7 @@ module Gimme
179
209
  @contact_links[key] = url
180
210
  end
181
211
 
212
+ ##
182
213
  # Remove negatives from the contacts hash.
183
214
  # Deletes a key value pair with a value of either nil or false.
184
215
  # Remember that false is a string.
@@ -187,8 +218,8 @@ module Gimme
187
218
  end
188
219
 
189
220
  # Saves any available contact info to @contact_links.
190
- def save_available_contacts(hsh = scan_for_contacts)
191
- puts 'saving available contact information...'
221
+ def save_available_contacts(url, hsh = scan_for_contacts)
222
+ puts "\nsaving available contact information from #{url}"
192
223
  return unless something_to_save?
193
224
  if hsh.is_a?(Hash)
194
225
  hsh.each do |k, v|
@@ -199,10 +230,10 @@ module Gimme
199
230
  else
200
231
  fail ArgumentError, "expected hash but got #{hsh.class}"
201
232
  end
202
- Search::POC.new(@contact_links)
233
+ Search::POC.new(url, @contact_links)
203
234
  end
204
235
 
205
- # Convenience method
236
+ # Convenience method.
206
237
  def memory
207
238
  Search.all_sites
208
239
  end
@@ -214,7 +245,7 @@ module Gimme
214
245
  end
215
246
  end
216
247
 
217
- # Collectino of sites searched
248
+ # Collection of sites searched.
218
249
  class Search
219
250
  @all_sites = []
220
251
 
@@ -224,8 +255,9 @@ class Search
224
255
 
225
256
  # Each site is saved to this class
226
257
  class POC
227
- def initialize(contact_info_hsh)
228
- @contact_info = contact_info_hsh
258
+ def initialize(url, contact_info_hsh)
259
+ @host = url
260
+ @info = contact_info_hsh
229
261
  Search.all_sites << self
230
262
  end
231
263
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: gimme_poc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Mason
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '1.2'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.3'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.3'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: pry
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -98,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
98
112
  version: '0'
99
113
  requirements: []
100
114
  rubyforge_project:
101
- rubygems_version: 2.4.5
115
+ rubygems_version: 2.4.3
102
116
  signing_key:
103
117
  specification_version: 4
104
118
  summary: Get a point of contact. Given a url or array of urls, extracts social media