gimme_poc 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +10 -2
- data/Rakefile +0 -16
- data/lib/gimme_poc/version.rb +1 -1
- data/lib/gimme_poc.rb +120 -88
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c2765f0ef20a443c6c37e7ffc7f2cfa83454ec6b
|
4
|
+
data.tar.gz: 9761000cf90a2aba6549dfa774a5aff7f6daebff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 48f94a4ecd20ab3db3197abcedecabeb8fb8a0fe7b4274fbff3d23bea395363bae69fe7bd3f42b56a5e701599e4780c367cae513b2ad95f9622167d99844011f
|
7
|
+
data.tar.gz: 132fad4c4fdc308a0cc3b630314525d17e197e0dcc568f5690ffdcfd351e434b8a9b2d5d1a0e9be613077b155c7cd51ed2f17c7a3ad1e2919121c88a0126dc78
|
data/README.md
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# Gimme POC
|
2
|
+
[![Build Status](https://travis-ci.org/m8ss/gimme_poc.svg?branch=master)](https://travis-ci.org/m8ss/gimme_poc) [![Code Climate](https://codeclimate.com/github/m8ss/gimme_poc/badges/gpa.svg)](https://codeclimate.com/github/m8ss/gimme_poc) [![Gem Version](https://badge.fury.io/rb/gimme_poc.svg)](https://badge.fury.io/rb/gimme_poc)
|
2
3
|
|
3
4
|
Gimme POC (Point of Contact) simplifies the process of extracting the common 'contact us' information from a website.
|
4
5
|
|
@@ -13,6 +14,13 @@ gem install gimme_poc
|
|
13
14
|
|
14
15
|
```
|
15
16
|
|
17
|
+
## Set Up
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
require 'gimme_poc' # => that's it!
|
21
|
+
|
22
|
+
```
|
23
|
+
|
16
24
|
## How it works
|
17
25
|
|
18
26
|
Gimme POC is easy to use! Simply run this command.
|
@@ -60,7 +68,7 @@ Gimme.memory
|
|
60
68
|
|
61
69
|
## Clearing the search results
|
62
70
|
|
63
|
-
To clear search results and start afresh,
|
71
|
+
To clear search results and start afresh, run:
|
64
72
|
|
65
73
|
```ruby
|
66
74
|
|
@@ -71,7 +79,7 @@ Gimme.reset!
|
|
71
79
|
## To do:
|
72
80
|
|
73
81
|
- Convenience methods for returning specific information from all sites (e.g. just Facebook or just Twitter)
|
74
|
-
- Work on false positives of bad urls. (DNS redirects don't give 404 errors)
|
82
|
+
- Work on false positives of bad urls. (Bad urls should be skipped + DNS redirects don't give 404 errors)
|
75
83
|
|
76
84
|
|
77
85
|
More to follow...
|
data/Rakefile
CHANGED
@@ -1,22 +1,6 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
require 'bundler'
|
3
|
-
|
4
|
-
begin
|
5
|
-
Bundler.setup(:default, :development)
|
6
|
-
rescue Bundler::BundlerError => e
|
7
|
-
$stderr.puts e.message
|
8
|
-
$stderr.puts 'Run `bundle install` to install missing gems'
|
9
|
-
exit e.status_code
|
10
|
-
end
|
11
2
|
require 'rake'
|
12
3
|
|
13
|
-
require 'rake/testtask'
|
14
|
-
Rake::TestTask.new(:test) do |test|
|
15
|
-
test.libs << 'lib' << 'test'
|
16
|
-
test.pattern = 'test/**/test_gimme_poc*.rb'
|
17
|
-
test.verbose = true
|
18
|
-
end
|
19
|
-
|
20
4
|
desc 'Open console with gimme_poc loaded'
|
21
5
|
task :console do
|
22
6
|
exec 'pry -r ./lib/gimme_poc.rb'
|
data/lib/gimme_poc/version.rb
CHANGED
data/lib/gimme_poc.rb
CHANGED
@@ -9,6 +9,9 @@ module Gimme
|
|
9
9
|
|
10
10
|
# Simple regex that looks for ###.#### or ###-####
|
11
11
|
PHONE_REGEX = /\d{3}[-]\d{4}|\d{3}[.]\d{4}/
|
12
|
+
|
13
|
+
# Captures http:// and https://
|
14
|
+
HTTP_REGEX = /\A\bhttps:\/\/|\bhttp:\/\//
|
12
15
|
|
13
16
|
## ----------------------------------------------------------------
|
14
17
|
# Questions
|
@@ -16,51 +19,7 @@ module Gimme
|
|
16
19
|
#
|
17
20
|
#
|
18
21
|
|
19
|
-
|
20
|
-
def contact_link?
|
21
|
-
@url = link_with_href('contact')
|
22
|
-
|
23
|
-
!@url.nil?
|
24
|
-
end
|
25
|
-
|
26
|
-
# True if contact page '../contact' does NOT get a 404 error.
|
27
|
-
def contact_page?
|
28
|
-
@url = page.uri.merge('../contact').to_s
|
29
|
-
begin
|
30
|
-
true if Mechanize.new.get(@url)
|
31
|
-
rescue Mechanize::ResponseCodeError
|
32
|
-
false
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
|
-
# Boolean, returns true if link to English version is present.
|
37
|
-
def english_link?
|
38
|
-
return false if page.link_with(href: /english/).nil?
|
39
|
-
@url = page.uri.merge(page.link_with(href: /english/).uri.to_s).to_s
|
40
|
-
|
41
|
-
!@url.nil?
|
42
|
-
end
|
43
|
-
|
44
|
-
# True if english page '../en' does NOT get a 404 error.
|
45
|
-
def en_page?
|
46
|
-
@url = page.uri.merge('../en').to_s
|
47
|
-
begin
|
48
|
-
true if Mechanize.new.get(@url)
|
49
|
-
rescue Mechanize::ResponseCodeError
|
50
|
-
false
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
# True if english page '../english' does NOT get a 404 error.
|
55
|
-
def english_page?
|
56
|
-
@url = page.uri.merge('../english').to_s
|
57
|
-
begin
|
58
|
-
true if Mechanize.new.get(@url)
|
59
|
-
rescue Mechanize::ResponseCodeError
|
60
|
-
false
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
22
|
+
##
|
64
23
|
# Boolean, returns true if anything is present
|
65
24
|
# after running scan_for_contacts.
|
66
25
|
def something_to_save?
|
@@ -77,8 +36,9 @@ module Gimme
|
|
77
36
|
!(page.body =~ PHONE_REGEX).nil?
|
78
37
|
end
|
79
38
|
|
39
|
+
##
|
80
40
|
# TODO: build better conditional to prevent false positives.
|
81
|
-
#
|
41
|
+
# There could be other forms like newsletter signup, etc.
|
82
42
|
#
|
83
43
|
# If there is a form with more than one field, this returns true.
|
84
44
|
# Forms with one field are typically search boxes.
|
@@ -94,25 +54,43 @@ module Gimme
|
|
94
54
|
#
|
95
55
|
#
|
96
56
|
|
57
|
+
##
|
97
58
|
# The main method!
|
98
59
|
# Takes array of urls and gets contact info for each if possible.
|
60
|
+
# If url is bad, it's converted to nil in 'get' method and skipped over.
|
99
61
|
def poc(arr)
|
100
62
|
arr = arr.split unless arr.is_a?(Array)
|
101
63
|
arr.each do |url|
|
102
64
|
puts '-' * 50
|
103
65
|
puts "starting: #{url}"
|
104
|
-
get(url)
|
66
|
+
next if get(url).nil?
|
67
|
+
puts "now looking for contact pages"
|
105
68
|
start_contact_links
|
106
|
-
go_to_contact_page
|
107
|
-
save_available_contacts
|
69
|
+
mechpage = go_to_contact_page
|
70
|
+
save_available_contacts(mechpage.uri.to_s)
|
108
71
|
end
|
109
72
|
Search.all_sites
|
110
73
|
end
|
74
|
+
|
75
|
+
# Mechanize needs absolute urls to work.
|
76
|
+
# If http:// or https:// isn't present, prepend http://.
|
77
|
+
def format_url(str)
|
78
|
+
str.prepend('http://') if (str =~ HTTP_REGEX).nil?
|
79
|
+
str
|
80
|
+
end
|
111
81
|
|
82
|
+
##
|
112
83
|
# Go to a page using Mechanize.
|
113
|
-
|
114
|
-
|
84
|
+
# Sleep for a split second to not overload any servers.
|
85
|
+
#
|
86
|
+
# Returns nil if bad url is given.
|
87
|
+
def get(str)
|
88
|
+
url = format_url(str)
|
89
|
+
puts "sending GET request to: #{url}"
|
90
|
+
sleep(0.1)
|
115
91
|
@page = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }.get(url)
|
92
|
+
rescue SocketError => e
|
93
|
+
puts "#{'skipping:'.red} -- #{e}"
|
116
94
|
end
|
117
95
|
|
118
96
|
# Starts/Restarts @contact_links hash
|
@@ -121,55 +99,107 @@ module Gimme
|
|
121
99
|
@contact_links = {}
|
122
100
|
end
|
123
101
|
|
102
|
+
##
|
124
103
|
# Scans for contact page. If it doesn't work on the first try,
|
125
|
-
# It will look for english versions and try again.
|
104
|
+
# It will look for english versions and try again. Processes left to right.
|
126
105
|
#
|
127
|
-
#
|
106
|
+
# Returns nil if no contact page can be found.
|
128
107
|
def go_to_contact_page
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
108
|
+
contact_page || english_contact_page
|
109
|
+
end
|
110
|
+
|
111
|
+
##
|
112
|
+
# Looks for contact page. Gets page if available.
|
113
|
+
# If no contact link is available, it will blind test '../contact'.
|
114
|
+
# Returns nil if nothing can be found.
|
115
|
+
def contact_page
|
116
|
+
contact_link = link_with_href(/contact|Contact/)
|
117
|
+
contact_test_page = merged_link('../contact')
|
118
|
+
|
119
|
+
case
|
120
|
+
when !contact_link.nil?
|
121
|
+
puts "#{'success:'.green} Found contact link!\n"
|
122
|
+
get(merged_link(contact_link))
|
123
|
+
else
|
124
|
+
puts "#{'warning:'.yellow}couldn't find contact link"
|
125
|
+
blind_test(contact_test_page)
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
##
|
130
|
+
# Looks for english page. Gets page if available then looks for
|
131
|
+
# english contact page.
|
132
|
+
#
|
133
|
+
# If no english link is available,
|
134
|
+
# it will blind test '../en' and '../english'.
|
135
|
+
# Returns nil if nothing can be found.
|
136
|
+
def english_contact_page
|
137
|
+
puts "\nLooking for english page..."
|
138
|
+
english_link = page.link_with(href: /english|English/)
|
139
|
+
test_en_page = merged_link('../en')
|
140
|
+
test_english_page = merged_link('../english')
|
141
|
+
|
142
|
+
case
|
143
|
+
when !english_link.nil?
|
144
|
+
puts "#{'success:'.green} found english link!"
|
145
|
+
get(merged(english_link)) # already merged link
|
146
|
+
else
|
147
|
+
blind_test(test_en_page) || blind_test(test_english_page)
|
148
|
+
puts "ready to start again"
|
149
|
+
contact_page
|
151
150
|
end
|
152
151
|
end
|
153
152
|
|
153
|
+
# TODO: Sometimes DNS will do a redirect and not give a 404.
|
154
|
+
# Need to prevent redirects.
|
155
|
+
#
|
156
|
+
# Blindly tests to see if a url goes through. If there is a 404 error,
|
157
|
+
# this will return nil.
|
158
|
+
def blind_test(url)
|
159
|
+
begin
|
160
|
+
puts "\nblind testing: #{url}"
|
161
|
+
get(url)
|
162
|
+
rescue Mechanize::ResponseCodeError
|
163
|
+
puts "#{'404 Error:'.red} #{url}"
|
164
|
+
end
|
165
|
+
end
|
166
|
+
|
167
|
+
##
|
168
|
+
# Used in case of relative paths. Merging guarantees correct url.
|
169
|
+
# This needs a url string as argument to work.
|
170
|
+
# Produces a merged uri string.
|
171
|
+
def merged_link(url_str)
|
172
|
+
page.uri.merge(url_str).to_s
|
173
|
+
end
|
174
|
+
|
175
|
+
##
|
154
176
|
# Expects relative paths and merges everything.
|
155
177
|
# Returns a string. If there's nothing, return nil.
|
178
|
+
#
|
179
|
+
# Prefix the pattern with a \b word boundary so the match must start at the beginning of a word.
|
156
180
|
def link_with_href(str)
|
157
|
-
|
181
|
+
merged_link(page.link_with(href: /\b#{str}/).uri.to_s)
|
182
|
+
rescue
|
183
|
+
nil
|
158
184
|
end
|
159
185
|
|
186
|
+
##
|
160
187
|
# Returns anything that is possible to save, otherwise returns nil.
|
161
188
|
# Booleans for phone, email, or contact form will display True or False.
|
189
|
+
#
|
190
|
+
# Add periods to link hrefs to prevent false positives. Must escape periods
|
191
|
+
# with a backslash or else it will be a regex wild card.
|
162
192
|
def scan_for_contacts
|
163
193
|
{
|
164
194
|
contactpage: link_with_href('contact'),
|
165
195
|
email_present: "#{email_available?}",
|
166
196
|
phone_present: "#{phone_available?}",
|
167
197
|
contact_form: "#{contactform_available?}",
|
168
|
-
facebook: link_with_href('facebook'),
|
169
|
-
twitter: link_with_href('twitter'),
|
170
|
-
youtube: link_with_href('youtube'),
|
171
|
-
googleplus: link_with_href('plus.google'),
|
172
|
-
linkedin: link_with_href('linkedin')
|
198
|
+
facebook: link_with_href('facebook\.'),
|
199
|
+
twitter: link_with_href('twitter\.'),
|
200
|
+
youtube: link_with_href('youtube\.'),
|
201
|
+
googleplus: link_with_href('plus\.google\.'),
|
202
|
+
linkedin: link_with_href('linkedin\.')
|
173
203
|
}
|
174
204
|
end
|
175
205
|
|
@@ -179,6 +209,7 @@ module Gimme
|
|
179
209
|
@contact_links[key] = url
|
180
210
|
end
|
181
211
|
|
212
|
+
##
|
182
213
|
# Remove negatives from the contacts hash.
|
183
214
|
# Deletes a key value pair with a value of either nil or false.
|
184
215
|
# Remember that false is a string.
|
@@ -187,8 +218,8 @@ module Gimme
|
|
187
218
|
end
|
188
219
|
|
189
220
|
# Saves any available contact info to @contact_links.
|
190
|
-
def save_available_contacts(hsh = scan_for_contacts)
|
191
|
-
puts
|
221
|
+
def save_available_contacts(url, hsh = scan_for_contacts)
|
222
|
+
puts "\nsaving available contact information from #{url}"
|
192
223
|
return unless something_to_save?
|
193
224
|
if hsh.is_a?(Hash)
|
194
225
|
hsh.each do |k, v|
|
@@ -199,10 +230,10 @@ module Gimme
|
|
199
230
|
else
|
200
231
|
fail ArgumentError, "expected hash but got #{hsh.class}"
|
201
232
|
end
|
202
|
-
Search::POC.new(@contact_links)
|
233
|
+
Search::POC.new(url, @contact_links)
|
203
234
|
end
|
204
235
|
|
205
|
-
# Convenience method
|
236
|
+
# Convenience method.
|
206
237
|
def memory
|
207
238
|
Search.all_sites
|
208
239
|
end
|
@@ -214,7 +245,7 @@ module Gimme
|
|
214
245
|
end
|
215
246
|
end
|
216
247
|
|
217
|
-
#
|
248
|
+
# Collection of sites searched.
|
218
249
|
class Search
|
219
250
|
@all_sites = []
|
220
251
|
|
@@ -224,8 +255,9 @@ class Search
|
|
224
255
|
|
225
256
|
# Each site is saved to this class
|
226
257
|
class POC
|
227
|
-
def initialize(contact_info_hsh)
|
228
|
-
@
|
258
|
+
def initialize(url, contact_info_hsh)
|
259
|
+
@host = url
|
260
|
+
@info = contact_info_hsh
|
229
261
|
Search.all_sites << self
|
230
262
|
end
|
231
263
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gimme_poc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Mason
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '1.2'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.3'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.3'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: pry
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -98,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
98
112
|
version: '0'
|
99
113
|
requirements: []
|
100
114
|
rubyforge_project:
|
101
|
-
rubygems_version: 2.4.
|
115
|
+
rubygems_version: 2.4.3
|
102
116
|
signing_key:
|
103
117
|
specification_version: 4
|
104
118
|
summary: Get a point of contact. Given a url or array of urls, extracts social media
|