gimme_poc 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +10 -2
- data/Rakefile +0 -16
- data/lib/gimme_poc/version.rb +1 -1
- data/lib/gimme_poc.rb +120 -88
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c2765f0ef20a443c6c37e7ffc7f2cfa83454ec6b
|
4
|
+
data.tar.gz: 9761000cf90a2aba6549dfa774a5aff7f6daebff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 48f94a4ecd20ab3db3197abcedecabeb8fb8a0fe7b4274fbff3d23bea395363bae69fe7bd3f42b56a5e701599e4780c367cae513b2ad95f9622167d99844011f
|
7
|
+
data.tar.gz: 132fad4c4fdc308a0cc3b630314525d17e197e0dcc568f5690ffdcfd351e434b8a9b2d5d1a0e9be613077b155c7cd51ed2f17c7a3ad1e2919121c88a0126dc78
|
data/README.md
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# Gimme POC
|
2
|
+
[Build Status](https://travis-ci.org/m8ss/gimme_poc) [Code Climate](https://codeclimate.com/github/m8ss/gimme_poc) [Gem Version](https://badge.fury.io/rb/gimme_poc)
|
2
3
|
|
3
4
|
Gimme POC (Point of Contact) simplifies the process of extracting the common 'contact us' information from a website.
|
4
5
|
|
@@ -13,6 +14,13 @@ gem install gimme_poc
|
|
13
14
|
|
14
15
|
```
|
15
16
|
|
17
|
+
## Set Up
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
require 'gimme_poc' # => that's it!
|
21
|
+
|
22
|
+
```
|
23
|
+
|
16
24
|
## How it works
|
17
25
|
|
18
26
|
Gimme POC is easy to use! Simply run this command.
|
@@ -60,7 +68,7 @@ Gimme.memory
|
|
60
68
|
|
61
69
|
## Clearing the search results
|
62
70
|
|
63
|
-
To clear search results and start afresh,
|
71
|
+
To clear search results and start afresh, run:
|
64
72
|
|
65
73
|
```ruby
|
66
74
|
|
@@ -71,7 +79,7 @@ Gimme.reset!
|
|
71
79
|
## To do:
|
72
80
|
|
73
81
|
- Convenience methods for returning specific information from all sites, (ie. just facebook or just twitter)
|
74
|
-
- Work on false positives of bad urls. (DNS redirects don't give 404 errors)
|
82
|
+
- Work on false positives of bad urls. (Bad urls should be skipped + DNS redirects don't give 404 errors)
|
75
83
|
|
76
84
|
|
77
85
|
More to follow...
|
data/Rakefile
CHANGED
@@ -1,22 +1,6 @@
|
|
1
1
|
require 'rubygems'
|
2
|
-
require 'bundler'
|
3
|
-
|
4
|
-
begin
|
5
|
-
Bundler.setup(:default, :development)
|
6
|
-
rescue Bundler::BundlerError => e
|
7
|
-
$stderr.puts e.message
|
8
|
-
$stderr.puts 'Run `bundle install` to install missing gems'
|
9
|
-
exit e.status_code
|
10
|
-
end
|
11
2
|
require 'rake'
|
12
3
|
|
13
|
-
require 'rake/testtask'
|
14
|
-
Rake::TestTask.new(:test) do |test|
|
15
|
-
test.libs << 'lib' << 'test'
|
16
|
-
test.pattern = 'test/**/test_gimme_poc*.rb'
|
17
|
-
test.verbose = true
|
18
|
-
end
|
19
|
-
|
20
4
|
desc 'Open console with gimme_poc loaded'
|
21
5
|
task :console do
|
22
6
|
exec 'pry -r ./lib/gimme_poc.rb'
|
data/lib/gimme_poc/version.rb
CHANGED
data/lib/gimme_poc.rb
CHANGED
@@ -9,6 +9,9 @@ module Gimme
|
|
9
9
|
|
10
10
|
# Simple regex that looks for ###.#### or ###-####
|
11
11
|
PHONE_REGEX = /\d{3}[-]\d{4}|\d{3}[.]\d{4}/
|
12
|
+
|
13
|
+
# Captures http:// and https://
|
14
|
+
HTTP_REGEX = /\A\bhttps:\/\/|\bhttp:\/\//
|
12
15
|
|
13
16
|
## ----------------------------------------------------------------
|
14
17
|
# Questions
|
@@ -16,51 +19,7 @@ module Gimme
|
|
16
19
|
#
|
17
20
|
#
|
18
21
|
|
19
|
-
|
20
|
-
def contact_link?
|
21
|
-
@url = link_with_href('contact')
|
22
|
-
|
23
|
-
!@url.nil?
|
24
|
-
end
|
25
|
-
|
26
|
-
# True if contact page '../contact' does NOT get a 404 error.
|
27
|
-
def contact_page?
|
28
|
-
@url = page.uri.merge('../contact').to_s
|
29
|
-
begin
|
30
|
-
true if Mechanize.new.get(@url)
|
31
|
-
rescue Mechanize::ResponseCodeError
|
32
|
-
false
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
|
-
# Boolean, returns true if link to English version is present.
|
37
|
-
def english_link?
|
38
|
-
return false if page.link_with(href: /english/).nil?
|
39
|
-
@url = page.uri.merge(page.link_with(href: /english/).uri.to_s).to_s
|
40
|
-
|
41
|
-
!@url.nil?
|
42
|
-
end
|
43
|
-
|
44
|
-
# True if english page '../en' does NOT get a 404 error.
|
45
|
-
def en_page?
|
46
|
-
@url = page.uri.merge('../en').to_s
|
47
|
-
begin
|
48
|
-
true if Mechanize.new.get(@url)
|
49
|
-
rescue Mechanize::ResponseCodeError
|
50
|
-
false
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
# True if english page '../english' does NOT get a 404 error.
|
55
|
-
def english_page?
|
56
|
-
@url = page.uri.merge('../english').to_s
|
57
|
-
begin
|
58
|
-
true if Mechanize.new.get(@url)
|
59
|
-
rescue Mechanize::ResponseCodeError
|
60
|
-
false
|
61
|
-
end
|
62
|
-
end
|
63
|
-
|
22
|
+
##
|
64
23
|
# Boolean, returns true if anything is present
|
65
24
|
# after running scan_for_contacts.
|
66
25
|
def something_to_save?
|
@@ -77,8 +36,9 @@ module Gimme
|
|
77
36
|
!(page.body =~ PHONE_REGEX).nil?
|
78
37
|
end
|
79
38
|
|
39
|
+
##
|
80
40
|
# TODO: build better conditional to prevent false positives.
|
81
|
-
#
|
41
|
+
# There could be other forms like newsletter signup, etc.
|
82
42
|
#
|
83
43
|
# If there is a form with more than one field, this returns true.
|
84
44
|
# Forms with one field are typically search boxes.
|
@@ -94,25 +54,43 @@ module Gimme
|
|
94
54
|
#
|
95
55
|
#
|
96
56
|
|
57
|
+
##
|
97
58
|
# The main method!
|
98
59
|
# Takes array of urls and gets contact info for each if possible.
|
60
|
+
# If url is bad, it's converted to nil in 'get' method and skipped over.
|
99
61
|
def poc(arr)
|
100
62
|
arr = arr.split unless arr.is_a?(Array)
|
101
63
|
arr.each do |url|
|
102
64
|
puts '-' * 50
|
103
65
|
puts "starting: #{url}"
|
104
|
-
get(url)
|
66
|
+
next if get(url).nil?
|
67
|
+
puts "now looking for contact pages"
|
105
68
|
start_contact_links
|
106
|
-
go_to_contact_page
|
107
|
-
save_available_contacts
|
69
|
+
mechpage = go_to_contact_page
|
70
|
+
save_available_contacts(mechpage.uri.to_s)
|
108
71
|
end
|
109
72
|
Search.all_sites
|
110
73
|
end
|
74
|
+
|
75
|
+
# Mechanize needs absolute urls to work.
|
76
|
+
# If http:// or https:// isn't present, prepend http://.
|
77
|
+
def format_url(str)
|
78
|
+
str.prepend('http://') if (str =~ HTTP_REGEX).nil?
|
79
|
+
str
|
80
|
+
end
|
111
81
|
|
82
|
+
##
|
112
83
|
# Go to a page using Mechanize.
|
113
|
-
|
114
|
-
|
84
|
+
# Sleep for a split second to not overload any servers.
|
85
|
+
#
|
86
|
+
# Returns nil if bad url is given.
|
87
|
+
def get(str)
|
88
|
+
url = format_url(str)
|
89
|
+
puts "sending GET request to: #{url}"
|
90
|
+
sleep(0.1)
|
115
91
|
@page = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }.get(url)
|
92
|
+
rescue SocketError => e
|
93
|
+
puts "#{'skipping:'.red} -- #{e}"
|
116
94
|
end
|
117
95
|
|
118
96
|
# Starts/Restarts @contact_links hash
|
@@ -121,55 +99,107 @@ module Gimme
|
|
121
99
|
@contact_links = {}
|
122
100
|
end
|
123
101
|
|
102
|
+
##
|
124
103
|
# Scans for contact page. If it doesn't work on the first try,
|
125
|
-
# It will look for english versions and try again.
|
104
|
+
# It will look for english versions and try again. Processes left to right.
|
126
105
|
#
|
127
|
-
#
|
106
|
+
# Returns nil if no contact page can be found.
|
128
107
|
def go_to_contact_page
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
108
|
+
contact_page || english_contact_page
|
109
|
+
end
|
110
|
+
|
111
|
+
##
|
112
|
+
# Looks for contact page. Gets page if available.
|
113
|
+
# If no contact link is available, it will blind test '../contact'.
|
114
|
+
# Returns nil if nothing can be found.
|
115
|
+
def contact_page
|
116
|
+
contact_link = link_with_href(/contact|Contact/)
|
117
|
+
contact_test_page = merged_link('../contact')
|
118
|
+
|
119
|
+
case
|
120
|
+
when !contact_link.nil?
|
121
|
+
puts "#{'success:'.green} Found contact link!\n"
|
122
|
+
get(merged_link(contact_link))
|
123
|
+
else
|
124
|
+
puts "#{'warning:'.yellow}couldn't find contact link"
|
125
|
+
blind_test(contact_test_page)
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
##
|
130
|
+
# Looks for english page. Gets page if available then looks for
|
131
|
+
# english contact page.
|
132
|
+
#
|
133
|
+
# If no english link is available,
|
134
|
+
# it will blind test '../en' and '../english'.
|
135
|
+
# Returns nil if nothing can be found.
|
136
|
+
def english_contact_page
|
137
|
+
puts "\nLooking for english page..."
|
138
|
+
english_link = page.link_with(href: /english|English/)
|
139
|
+
test_en_page = merged_link('../en')
|
140
|
+
test_english_page = merged_link('../english')
|
141
|
+
|
142
|
+
case
|
143
|
+
when !english_link.nil?
|
144
|
+
puts "#{'success:'.green} found english link!"
|
145
|
+
get(merged(english_link)) # already merged link
|
146
|
+
else
|
147
|
+
blind_test(test_en_page) || blind_test(test_english_page)
|
148
|
+
puts "ready to start again"
|
149
|
+
contact_page
|
151
150
|
end
|
152
151
|
end
|
153
152
|
|
153
|
+
# TODO: Sometimes DNS will do a redirect and not give a 404.
|
154
|
+
# Need to prevent redirects.
|
155
|
+
#
|
156
|
+
# Blindly tests to see if a url goes through. If there is a 404 error,
|
157
|
+
# this will return nil.
|
158
|
+
def blind_test(url)
|
159
|
+
begin
|
160
|
+
puts "\nblind testing: #{url}"
|
161
|
+
get(url)
|
162
|
+
rescue Mechanize::ResponseCodeError
|
163
|
+
puts "#{'404 Error:'.red} #{url}"
|
164
|
+
end
|
165
|
+
end
|
166
|
+
|
167
|
+
##
|
168
|
+
# Used in case of relative paths. Merging guarantees correct url.
|
169
|
+
# This needs a url string as argument to work.
|
170
|
+
# Produces a merged uri string.
|
171
|
+
def merged_link(url_str)
|
172
|
+
page.uri.merge(url_str).to_s
|
173
|
+
end
|
174
|
+
|
175
|
+
##
|
154
176
|
# Expects relative paths and merges everything.
|
155
177
|
# Returns a string. If there's nothing, return nil.
|
178
|
+
#
|
179
|
+
# Add \b word boundary to ensure whole word is searched.
|
156
180
|
def link_with_href(str)
|
157
|
-
|
181
|
+
merged_link(page.link_with(href: /\b#{str}/).uri.to_s)
|
182
|
+
rescue
|
183
|
+
nil
|
158
184
|
end
|
159
185
|
|
186
|
+
##
|
160
187
|
# Returns anything that is possible to save, otherwise returns nil.
|
161
188
|
# Booleans for phone, email, or contact form will display True or False.
|
189
|
+
#
|
190
|
+
# Add periods to link hrefs to prevent false positives. Must escape periods
|
191
|
+
# with a backslash or else it will be a regex wild card.
|
162
192
|
def scan_for_contacts
|
163
193
|
{
|
164
194
|
contactpage: link_with_href('contact'),
|
165
195
|
email_present: "#{email_available?}",
|
166
196
|
phone_present: "#{phone_available?}",
|
167
197
|
contact_form: "#{contactform_available?}",
|
168
|
-
facebook: link_with_href('facebook'),
|
169
|
-
twitter: link_with_href('twitter'),
|
170
|
-
youtube: link_with_href('youtube'),
|
171
|
-
googleplus: link_with_href('plus
|
172
|
-
linkedin: link_with_href('linkedin')
|
198
|
+
facebook: link_with_href('facebook\.'),
|
199
|
+
twitter: link_with_href('twitter\.'),
|
200
|
+
youtube: link_with_href('youtube\.'),
|
201
|
+
googleplus: link_with_href('plus\.google\.'),
|
202
|
+
linkedin: link_with_href('linkedin\.')
|
173
203
|
}
|
174
204
|
end
|
175
205
|
|
@@ -179,6 +209,7 @@ module Gimme
|
|
179
209
|
@contact_links[key] = url
|
180
210
|
end
|
181
211
|
|
212
|
+
##
|
182
213
|
# Remove negatives from the contacts hash.
|
183
214
|
# Deletes a key value pair with a value of either nil or false.
|
184
215
|
# Remember that false is a string.
|
@@ -187,8 +218,8 @@ module Gimme
|
|
187
218
|
end
|
188
219
|
|
189
220
|
# Saves any available contact info to @contact_links.
|
190
|
-
def save_available_contacts(hsh = scan_for_contacts)
|
191
|
-
puts
|
221
|
+
def save_available_contacts(url, hsh = scan_for_contacts)
|
222
|
+
puts "\nsaving available contact information from #{url}"
|
192
223
|
return unless something_to_save?
|
193
224
|
if hsh.is_a?(Hash)
|
194
225
|
hsh.each do |k, v|
|
@@ -199,10 +230,10 @@ module Gimme
|
|
199
230
|
else
|
200
231
|
fail ArgumentError, "expected hash but got #{hsh.class}"
|
201
232
|
end
|
202
|
-
Search::POC.new(@contact_links)
|
233
|
+
Search::POC.new(url, @contact_links)
|
203
234
|
end
|
204
235
|
|
205
|
-
# Convenience method
|
236
|
+
# Convenience method.
|
206
237
|
def memory
|
207
238
|
Search.all_sites
|
208
239
|
end
|
@@ -214,7 +245,7 @@ module Gimme
|
|
214
245
|
end
|
215
246
|
end
|
216
247
|
|
217
|
-
#
|
248
|
+
# Collection of sites searched.
|
218
249
|
class Search
|
219
250
|
@all_sites = []
|
220
251
|
|
@@ -224,8 +255,9 @@ class Search
|
|
224
255
|
|
225
256
|
# Each site is saved to this class
|
226
257
|
class POC
|
227
|
-
def initialize(contact_info_hsh)
|
228
|
-
@
|
258
|
+
def initialize(url, contact_info_hsh)
|
259
|
+
@host = url
|
260
|
+
@info = contact_info_hsh
|
229
261
|
Search.all_sites << self
|
230
262
|
end
|
231
263
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gimme_poc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Mason
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '1.2'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.3'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.3'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: pry
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -98,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
98
112
|
version: '0'
|
99
113
|
requirements: []
|
100
114
|
rubyforge_project:
|
101
|
-
rubygems_version: 2.4.
|
115
|
+
rubygems_version: 2.4.3
|
102
116
|
signing_key:
|
103
117
|
specification_version: 4
|
104
118
|
summary: Get a point of contact. Given a url or array of urls, extracts social media
|