gimme_poc 0.0.0.beta → 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +76 -2
- data/lib/gimme_poc/version.rb +2 -2
- data/lib/gimme_poc.rb +91 -61
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0e6506c640bcafe56d6906bfaae60f8a17e72865
|
4
|
+
data.tar.gz: 21e4fc7176c2d5aa28a0fe31ab993889a7fbf74d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 39c35961983b31246c7414067d607bbc6c7a6dbddc5af9dfa97122ff094b881248db7a4a9c6415c33e393b3f59e5c632f0b06c3fe04e70dd9035892c5c2b11cf
|
7
|
+
data.tar.gz: b1a194c4f0d51d7b3ea77f81786cc5b2d981b7c27f22dbeb6db98f8c1cca688437d04a1af5c9193d25e92072f88c664468d59ff89a2a853695e59809792ee457
|
data/README.md
CHANGED
@@ -1,3 +1,77 @@
|
|
1
|
-
|
1
|
+
# Gimme POC
|
2
2
|
|
3
|
-
|
3
|
+
Gimme POC (Point of Contact) simplifies the process of extracting the common 'contact us' information from a website.
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
Gimme POC simply looks for a contact page and extracts social media contact information, if present. Due to the CAN-SPAM Act of 2003, email addresses are not harvested. Instead, Gimme POC reports (as true or false) whether an email link or a contact form is available.
|
8
|
+
|
9
|
+
## Installation
|
10
|
+
|
11
|
+
```
|
12
|
+
gem install gimme_poc
|
13
|
+
|
14
|
+
```
|
15
|
+
|
16
|
+
## How it works
|
17
|
+
|
18
|
+
Gimme POC is easy to use! Simply run this command.
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
|
22
|
+
Gimme.poc 'http://example.com'
|
23
|
+
|
24
|
+
# => returns this:
|
25
|
+
#
|
26
|
+
# {
|
27
|
+
# :contactpage=>"http://example.com/contact/",
|
28
|
+
# :phone_present?=>"true",
|
29
|
+
# :contact_form?=>"true",
|
30
|
+
# :facebook=>"http://www.facebook.com/example",
|
31
|
+
# :twitter=>"http://twitter.com/@example",
|
32
|
+
# :googleplus=>"http://plus.google.com/+example",
|
33
|
+
# :linkedin=>"http://www.linkedin.com/in/example"
|
34
|
+
# }
|
35
|
+
#
|
36
|
+
#
|
37
|
+
|
38
|
+
|
39
|
+
```
|
40
|
+
|
41
|
+
## Searching more than one site
|
42
|
+
|
43
|
+
You also have the ability to pass multiple URLs in the form of an array. For example, you could run the command below and get contact information from each site all at once.
|
44
|
+
|
45
|
+
```ruby
|
46
|
+
|
47
|
+
Gimme.poc(['http://example.com', 'http://foo.com', 'http://bar.com'])
|
48
|
+
|
49
|
+
```
|
50
|
+
|
51
|
+
## Referencing the search results
|
52
|
+
|
53
|
+
To use your search results, simply run:
|
54
|
+
|
55
|
+
```ruby
|
56
|
+
|
57
|
+
Gimme.memory
|
58
|
+
|
59
|
+
```
|
60
|
+
|
61
|
+
## Clearing the search results
|
62
|
+
|
63
|
+
To clear search results and start afresh, simply run:
|
64
|
+
|
65
|
+
```ruby
|
66
|
+
|
67
|
+
Gimme.reset!
|
68
|
+
|
69
|
+
```
|
70
|
+
|
71
|
+
## To do:
|
72
|
+
|
73
|
+
- Convenience methods for returning specific information from all sites (i.e., just Facebook or just Twitter)
|
74
|
+
- Work on false positives of bad urls. (DNS redirects don't give 404 errors)
|
75
|
+
|
76
|
+
|
77
|
+
More to follow...
|
data/lib/gimme_poc/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
1
|
module Gimme
|
2
|
-
VERSION =
|
3
|
-
end
|
2
|
+
VERSION = '0.0.1'
|
3
|
+
end
|
data/lib/gimme_poc.rb
CHANGED
@@ -6,23 +6,24 @@ require_relative './gimme_poc/version'
|
|
6
6
|
module Gimme
|
7
7
|
class << self
|
8
8
|
attr_accessor :page, :contact, :contact_links, :url
|
9
|
-
|
9
|
+
|
10
10
|
# Simple regex that looks for ###.#### or ###-####
|
11
11
|
PHONE_REGEX = /\d{3}[-]\d{4}|\d{3}[.]\d{4}/
|
12
|
-
|
12
|
+
|
13
13
|
## ----------------------------------------------------------------
|
14
|
-
# Questions
|
14
|
+
# Questions
|
15
15
|
#
|
16
16
|
#
|
17
17
|
#
|
18
|
-
|
18
|
+
|
19
19
|
# Boolean, returns true if contact link is present.
|
20
20
|
def contact_link?
|
21
21
|
@url = link_with_href('contact')
|
22
|
-
|
22
|
+
|
23
|
+
!@url.nil?
|
23
24
|
end
|
24
|
-
|
25
|
-
#
|
25
|
+
|
26
|
+
# True if contact page '../contact' does NOT get a 404 error.
|
26
27
|
def contact_page?
|
27
28
|
@url = page.uri.merge('../contact').to_s
|
28
29
|
begin
|
@@ -31,16 +32,16 @@ module Gimme
|
|
31
32
|
false
|
32
33
|
end
|
33
34
|
end
|
34
|
-
|
35
35
|
|
36
36
|
# Boolean, returns true if link to English version is present.
|
37
37
|
def english_link?
|
38
38
|
return false if page.link_with(href: /english/).nil?
|
39
39
|
@url = page.uri.merge(page.link_with(href: /english/).uri.to_s).to_s
|
40
|
-
|
40
|
+
|
41
|
+
!@url.nil?
|
41
42
|
end
|
42
|
-
|
43
|
-
#
|
43
|
+
|
44
|
+
# True if english page '../en' does NOT get a 404 error.
|
44
45
|
def en_page?
|
45
46
|
@url = page.uri.merge('../en').to_s
|
46
47
|
begin
|
@@ -50,8 +51,7 @@ module Gimme
|
|
50
51
|
end
|
51
52
|
end
|
52
53
|
|
53
|
-
|
54
|
-
# Check if english page '../english' gets a 404 error.
|
54
|
+
# True if english page '../english' does NOT get a 404 error.
|
55
55
|
def english_page?
|
56
56
|
@url = page.uri.merge('../english').to_s
|
57
57
|
begin
|
@@ -66,18 +66,17 @@ module Gimme
|
|
66
66
|
def something_to_save?
|
67
67
|
scan_for_contacts.any?
|
68
68
|
end
|
69
|
-
|
69
|
+
|
70
70
|
# Boolean, returns true if email is present.
|
71
71
|
def email_available?
|
72
|
-
|
72
|
+
!link_with_href('mailto').nil?
|
73
73
|
end
|
74
|
-
|
75
|
-
|
74
|
+
|
76
75
|
# Boolean, returns true if phone number is present.
|
77
76
|
def phone_available?
|
78
|
-
|
77
|
+
!(page.body =~ PHONE_REGEX).nil?
|
79
78
|
end
|
80
|
-
|
79
|
+
|
81
80
|
# TODO: build better conditional to prevent false positives.
|
82
81
|
# There could be other forms like newsletter signup, etc.
|
83
82
|
#
|
@@ -86,11 +85,11 @@ module Gimme
|
|
86
85
|
#
|
87
86
|
# Boolean, returns true if form is present on page.
|
88
87
|
def contactform_available?
|
89
|
-
|
88
|
+
!(page.forms.select { |x| x.fields.length > 1 }.empty?)
|
90
89
|
end
|
91
|
-
|
90
|
+
|
92
91
|
## ----------------------------------------------------------------
|
93
|
-
# Actions
|
92
|
+
# Actions
|
94
93
|
#
|
95
94
|
#
|
96
95
|
#
|
@@ -100,68 +99,72 @@ module Gimme
|
|
100
99
|
def poc(arr)
|
101
100
|
arr = arr.split unless arr.is_a?(Array)
|
102
101
|
arr.each do |url|
|
102
|
+
puts '-' * 50
|
103
|
+
puts "starting: #{url}"
|
103
104
|
get(url)
|
104
105
|
start_contact_links
|
105
106
|
go_to_contact_page
|
106
107
|
save_available_contacts
|
107
108
|
end
|
108
|
-
|
109
|
-
puts 'press RETURN'
|
110
|
-
gets
|
111
|
-
p @contacts_links # need to add to an overall array at some point.
|
109
|
+
Search.all_sites
|
112
110
|
end
|
113
|
-
|
114
|
-
# Go to a page using Mechanize.
|
111
|
+
|
112
|
+
# Go to a page using Mechanize.
|
115
113
|
def get(url)
|
116
|
-
puts "
|
117
|
-
@page = Mechanize.new.get(url)
|
114
|
+
puts "sending GET request: #{url}..."
|
115
|
+
@page = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }.get(url)
|
118
116
|
end
|
119
|
-
|
117
|
+
|
120
118
|
# Starts/Restarts @contact_links hash
|
121
119
|
def start_contact_links
|
122
|
-
puts
|
120
|
+
puts 'setting contact links hash to {}'
|
123
121
|
@contact_links = {}
|
124
122
|
end
|
125
|
-
|
126
|
-
# Scans for contact page. If it doesn't work on the first try,
|
123
|
+
|
124
|
+
# Scans for contact page. If it doesn't work on the first try,
|
127
125
|
# It will look for english versions and try again.
|
128
126
|
#
|
129
127
|
# If contact page is found, go directly there and don't try again.
|
130
128
|
def go_to_contact_page
|
131
129
|
1.times do
|
132
|
-
|
133
|
-
|
130
|
+
case
|
131
|
+
when contact_link?
|
132
|
+
puts 'found contact link!'.green
|
134
133
|
get(@url)
|
135
|
-
|
136
|
-
puts
|
134
|
+
when contact_page?
|
135
|
+
puts 'found contact page!'.green
|
137
136
|
get(@url)
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
137
|
+
else
|
138
|
+
if english_link? # look for link first, not check the page.
|
139
|
+
puts 'found english link!'
|
140
|
+
get(@url)
|
141
|
+
redo
|
142
|
+
elsif en_page?
|
143
|
+
puts 'found en page!'
|
144
|
+
get(@url)
|
145
|
+
redo
|
146
|
+
elsif english_page?
|
147
|
+
puts 'found english page!'
|
148
|
+
redo
|
149
|
+
end
|
147
150
|
end
|
148
151
|
end
|
149
152
|
end
|
150
153
|
|
151
|
-
# Expects relative paths and merges everything.
|
154
|
+
# Expects relative paths and merges everything.
|
152
155
|
# Returns a string. If there's nothing, return nil.
|
153
156
|
def link_with_href(str)
|
154
157
|
page.uri.merge(page.link_with(href: /#{str}/).uri.to_s).to_s rescue nil
|
155
|
-
end
|
158
|
+
end
|
156
159
|
|
157
160
|
# Returns anything that is possible to save, otherwise returns nil.
|
158
161
|
# Booleans for phone, email, or contact form will display True or False.
|
159
162
|
def scan_for_contacts
|
160
163
|
{
|
161
164
|
contactpage: link_with_href('contact'),
|
162
|
-
email_present
|
163
|
-
phone_present
|
164
|
-
contact_form
|
165
|
+
email_present: "#{email_available?}",
|
166
|
+
phone_present: "#{phone_available?}",
|
167
|
+
contact_form: "#{contactform_available?}",
|
165
168
|
facebook: link_with_href('facebook'),
|
166
169
|
twitter: link_with_href('twitter'),
|
167
170
|
youtube: link_with_href('youtube'),
|
@@ -175,28 +178,55 @@ module Gimme
|
|
175
178
|
return if key.nil? || url.nil?
|
176
179
|
@contact_links[key] = url
|
177
180
|
end
|
178
|
-
|
181
|
+
|
179
182
|
# Remove negatives from the contacts hash.
|
180
183
|
# Deletes a key value pair with a value of either nil or false.
|
181
184
|
# Remember that false is a string.
|
182
185
|
def delete_failures(hsh)
|
183
|
-
hsh.delete_if {|
|
186
|
+
hsh.delete_if { |_k, v| v.nil? || v == 'false' }
|
184
187
|
end
|
185
188
|
|
186
189
|
# Saves any available contact info to @contact_links.
|
187
190
|
def save_available_contacts(hsh = scan_for_contacts)
|
188
|
-
puts '
|
191
|
+
puts 'saving available contact information...'
|
189
192
|
return unless something_to_save?
|
190
193
|
if hsh.is_a?(Hash)
|
191
194
|
hsh.each do |k, v|
|
192
|
-
save_link(k, v)
|
195
|
+
save_link(k, v) # saves to @contact_links
|
193
196
|
end
|
194
|
-
delete_failures(
|
195
|
-
puts "
|
196
|
-
|
197
|
-
|
198
|
-
raise ArgumentError, "expected hash but got #{hsh.class}"
|
197
|
+
delete_failures(@contact_links)
|
198
|
+
puts "#{@contact_links}".cyan # same as @contact_links
|
199
|
+
else
|
200
|
+
fail ArgumentError, "expected hash but got #{hsh.class}"
|
199
201
|
end
|
202
|
+
Search::POC.new(@contact_links)
|
203
|
+
end
|
204
|
+
|
205
|
+
# Convenience method
|
206
|
+
def memory
|
207
|
+
Search.all_sites
|
208
|
+
end
|
209
|
+
|
210
|
+
# Clears entire collection.
|
211
|
+
def reset!
|
212
|
+
Search.all_sites = []
|
213
|
+
end
|
214
|
+
end
|
215
|
+
end
|
216
|
+
|
217
|
+
# Collection of sites searched
|
218
|
+
class Search
|
219
|
+
@all_sites = []
|
220
|
+
|
221
|
+
class << self
|
222
|
+
attr_accessor :all_sites
|
223
|
+
end
|
224
|
+
|
225
|
+
# Each site is saved to this class
|
226
|
+
class POC
|
227
|
+
def initialize(contact_info_hsh)
|
228
|
+
@contact_info = contact_info_hsh
|
229
|
+
Search.all_sites << self
|
200
230
|
end
|
201
231
|
end
|
202
232
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: gimme_poc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Mason
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-10-
|
11
|
+
date: 2015-10-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -93,12 +93,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
93
93
|
version: '0'
|
94
94
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
95
95
|
requirements:
|
96
|
-
- - "
|
96
|
+
- - ">="
|
97
97
|
- !ruby/object:Gem::Version
|
98
|
-
version:
|
98
|
+
version: '0'
|
99
99
|
requirements: []
|
100
100
|
rubyforge_project:
|
101
|
-
rubygems_version: 2.4.
|
101
|
+
rubygems_version: 2.4.5
|
102
102
|
signing_key:
|
103
103
|
specification_version: 4
|
104
104
|
summary: Get a point of contact. Given a url or array of urls, extracts social media
|