ubi 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +3 -0
- data/Gemfile +11 -0
- data/Guardfile +16 -0
- data/MIT-LICENSE +19 -0
- data/README.md +57 -0
- data/Rakefile +9 -0
- data/bin/ubi +10 -0
- data/lib/ubi.rb +37 -0
- data/lib/ubi/aranea.rb +42 -0
- data/lib/ubi/artifex.rb +28 -0
- data/lib/ubi/consultor.rb +61 -0
- data/lib/ubi/consultores/bing.rb +26 -0
- data/lib/ubi/consultores/duckduckgo.rb +26 -0
- data/lib/ubi/consultores/facebook.rb +6 -0
- data/lib/ubi/consultores/foursquare.rb +6 -0
- data/lib/ubi/consultores/google.rb +24 -0
- data/lib/ubi/consultores/linkedin.rb +0 -0
- data/lib/ubi/consultores/twitter.rb +6 -0
- data/lib/ubi/consultores/wikipedia.rb +6 -0
- data/lib/ubi/consultores/yahoo.rb +26 -0
- data/lib/ubi/datum.rb +43 -0
- data/lib/ubi/impero.rb +20 -0
- data/lib/ubi/memoria.rb +72 -0
- data/lib/ubi/memorias/address.rb +71 -0
- data/lib/ubi/memorias/document.rb +50 -0
- data/lib/ubi/memorias/email.rb +19 -0
- data/lib/ubi/memorias/phone.rb +33 -0
- data/lib/ubi/memorias/site.rb +29 -0
- data/lib/ubi/memorias/social.rb +20 -0
- data/lib/ubi/memorias/who.rb +20 -0
- data/lib/ubi/thema.rb +62 -0
- data/lib/ubi/version.rb +4 -0
- data/spec/fixtures/email.txt +5 -0
- data/spec/fixtures/mobile.txt +17 -0
- data/spec/fixtures/page.txt +21 -0
- data/spec/fixtures/phone.txt +17 -0
- data/spec/fixtures/site.txt +21 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/ubi/aranea_spec.rb +19 -0
- data/spec/ubi/artifex_spec.rb +4 -0
- data/spec/ubi/memorias/address_spec.rb +56 -0
- data/spec/ubi/memorias/document_spec.rb +48 -0
- data/spec/ubi/memorias/email_spec.rb +59 -0
- data/spec/ubi/memorias/phone_spec.rb +79 -0
- data/spec/ubi/memorias/site_spec.rb +82 -0
- data/spec/ubi/thema_spec.rb +33 -0
- data/ubi.gemspec +39 -0
- metadata +232 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 330dfff07aab154b3fc714a71b10cf0f148d541d
|
4
|
+
data.tar.gz: 81ff08e9a049671a6f2c51c43641007466fa30c3
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fae80b06b1e6f5bac23357f8b4c09ef45879ba62b4e1e6e90c9e42a16490f7496cc614b27538beaed7d6529d8784fce29a79722e4a351379079cc0e4be6bf1cb
|
7
|
+
data.tar.gz: 4a4ec085b39e27a0e64addcf9c22fff3a969bdff21a72c8684bcb500451d3c4503eb1935383fd96715dd72257082f4851476b9b2c8d4a26244b6363b9264588b
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
#
|
2
|
+
# Ubi Guard
|
3
|
+
#
|
4
|
+
# Run RuboCop whenever a Ruby source file or a RuboCop config file changes.
guard :rubocop, all_on_start: false, keep_failed: false, cli: ['-D'] do
  # Any Ruby file.
  watch(/.+\.rb$/)
  # A .rubocop.yml at any depth yields the directory that contains it.
  watch(%r{(?:.+/)?\.rubocop\.yml$}) { |match| File.dirname(match[0]) }
end
|
8
|
+
|
9
|
+
# Run RSpec when a spec, its matching library file, or shared setup changes.
guard :rspec, cmd: 'bundle exec rspec' do
  # A changed spec file is matched directly.
  watch(%r{^spec/.+_spec\.rb$})
  # lib/<path>.rb maps to spec/<path>_spec.rb.
  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }

  # The library entry point and the spec helper both map to the whole spec directory.
  watch('lib/ubi.rb') { 'spec' }
  watch('spec/spec_helper.rb') { 'spec' }
end
|
data/MIT-LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2014-2015 Ubi Authors
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a
|
4
|
+
copy of this software and associated documentation files (the "Software"),
|
5
|
+
to deal in the Software without restriction, including without limitation
|
6
|
+
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
7
|
+
and/or sell copies of the Software, and to permit persons to whom the
|
8
|
+
Software is furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be
|
11
|
+
included in all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
14
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
15
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
16
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
17
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
18
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
19
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
_ _
|
2
|
+
/\ /\ | |__ (_)
|
3
|
+
/ / \ \| '_ \ | |
|
4
|
+
\ \_/ /| |_) || |
|
5
|
+
\___/ |_.__/ |_|
|
6
|
+
|
7
|
+
|
8
|
+
Ubi finds information in the subject's webpage(s).
|
9
|
+
A forager.
|
10
|
+
|
11
|
+
|
12
|
+
## Thema -> Subject/Matter
|
13
|
+
|
14
|
+
- Name
|
15
|
+
- Email
|
16
|
+
- URL
|
17
|
+
|
18
|
+
|
19
|
+
## Memoria -> Attribute/Trait/Memory
|
20
|
+
|
21
|
+
- URLs
|
22
|
+
- Emails
|
23
|
+
- Phones
|
24
|
+
- Social
|
25
|
+
- Files
|
26
|
+
- Documents
|
27
|
+
- Address
|
28
|
+
- Logo/Images
|
29
|
+
|
30
|
+
|
31
|
+
## Aranea -> Spider/Crawler
|
32
|
+
|
33
|
+
Search
|
34
|
+
API
|
35
|
+
|
36
|
+
Others
|
37
|
+
|
38
|
+
|
39
|
+
## Use
|
40
|
+
|
41
|
+
```
|
42
|
+
Ubi::Thema.new('A Company on City').urls
|
43
|
+
Ubi::Thema.new('A Company on City', 'company.com').phones
|
44
|
+
```
|
45
|
+
|
46
|
+
### Memorias Directly
|
47
|
+
|
48
|
+
```
|
49
|
+
Ubi::Memoria::Email.parse('A text with some valid@emails.com')
|
50
|
+
Ubi::Memoria::Phone.parse('A text with some +55-5555-5555')
|
51
|
+
Ubi::Memoria::Site.parse('A text with some http://urls.com')
|
52
|
+
```
|
53
|
+
|
54
|
+
##### Keep crawling
|
55
|
+
|
56
|
+
> Ubi and the class names are Latin, in case you're curious.
|
57
|
+
> Yeah, apparently I got plenty of free time.
|
data/Rakefile
ADDED
data/bin/ubi
ADDED
data/lib/ubi.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'pry'
|
2
|
+
require 'thor'
|
3
|
+
require 'net/http'
|
4
|
+
require 'net/https'
|
5
|
+
require 'uri'
|
6
|
+
require 'open-uri'
|
7
|
+
require 'json'
|
8
|
+
require 'phonelib'
|
9
|
+
require 'nokogiri'
|
10
|
+
require 'active_model'
|
11
|
+
require 'active_support'
|
12
|
+
require 'active_support/core_ext/hash'
|
13
|
+
require 'ubi/version'
|
14
|
+
|
15
|
+
# Ubiquitous getters: module-level registries, each starting as an empty Array.
module Ubi
  # Registry for memorias.
  mattr_accessor(:memorias) { [] }
  # Registry for araneas; Consultor::Base.inherited appends subclasses here.
  mattr_accessor(:araneas) { [] }

  # Legacy configuration, kept for reference:
  # Phonie.configuration.default_country_code = '55'
  # Phonie.configuration.n1_length = 4
end
|
23
|
+
|
24
|
+
require 'ubi/aranea'
|
25
|
+
require 'ubi/memoria'
|
26
|
+
require 'ubi/consultor'
|
27
|
+
|
28
|
+
# Require every file found under ubi/memorias and ubi/consultores,
# in sorted path order so loading is deterministic.
%w( memorias consultores ).each do |namespace|
  pattern = "#{File.dirname(__FILE__)}/ubi/#{namespace}/**.rb"
  Dir[pattern].sort.each { |path| require path }
end
|
33
|
+
|
34
|
+
# Loads after all
|
35
|
+
require 'ubi/artifex'
|
36
|
+
require 'ubi/datum'
|
37
|
+
require 'ubi/thema'
|
data/lib/ubi/aranea.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
require 'polipus'
|
2
|
+
|
3
|
+
module Ubi
  # Base for araneas (spiders): crawls a thema's URL through Polipus.
  class Aranea
    # Default crawler options; the opts given to #initialize are merged
    # over these in #crawl!. Frozen so the shared defaults cannot be mutated.
    OPTIONS = {
      workers: 3,
      user_agent: "Ubi v#{Ubi::VERSION}",
      depth_limit: 1

      # storage: MemoryStore
    }.freeze

    attr_accessor :thema, :url
    # Only the writer is generated: the reader is hand-written below, and
    # generating both would trigger a "method redefined" warning.
    attr_writer :datum

    delegate :name, to: :thema

    # thema - subject being crawled; must respond to #name (used as crawler name).
    # url   - URL handed to the crawler as its starting point.
    # opts  - Hash of options merged over OPTIONS.
    def initialize(thema, url, opts = {})
      @thema = thema
      @url = url
      @opts = opts
    end

    # Starts a Polipus crawl of #url and prints the title and URL of every
    # downloaded page.
    def crawl!
      Polipus.crawler(name, url, OPTIONS.merge(@opts)) do |crawler|
        # In-place page processing
        crawler.on_page_downloaded do |page|
          # page.doc is a nokogiri object
          puts "'#{page.doc.css('title').text}' (#{page.url})"
        end
      end
    end

    # Parses an HTML chunk into a Nokogiri document.
    def parser(chunk)
      Nokogiri::HTML(chunk)
    end

    # Returns @datum, triggering a crawl first when it is not set.
    # NOTE(review): crawl! never assigns @datum here, so this returns nil
    # unless #datum= was called - confirm the intended source of the data.
    def datum
      crawl! unless @datum
      @datum
    end
  end
end
|
data/lib/ubi/artifex.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
module Ubi
  # Artifex: I work!
  # Wraps a query in a Thema and runs each search stage on construction.
  class Artifex
    include Consultor

    attr_accessor :thema

    delegate :spec, to: :thema

    # query - passed straight to Thema.new.
    def initialize(query)
      @thema = Thema.new(query)
      start_with_search
      social_search
      other_search
    end

    # Offers each search-engine consultor to the thema, in this order.
    def start_with_search
      [Google, Yahoo, Bing, DuckDuckGo].each { |engine| @thema.try_consultor(engine) }
    end

    # Placeholder: no social consultors are queried yet.
    def social_search; end

    # Placeholder: no other consultors are queried yet.
    def other_search; end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module Ubi
  module Consultor
    # Base for consultores (search-engine queriers).
    #
    # Subclasses are expected to provide #query (a Hash of request
    # parameters), #words (an XPath string) and .url (the endpoint prefix);
    # this class calls them but does not define #query or #words itself.
    class Base
      # Request headers sent with every query.
      HEADERS = { 'User-Agent' => "Ubi v#{Ubi::VERSION}" }.freeze

      # thema - the subject being searched for.
      def initialize(thema)
        @thema = thema
      end

      # Full request URL: the class-level endpoint followed by the
      # URL-encoded query parameters.
      def query_url
        self.class.url + query.to_query
      end

      # XPath selecting link nodes; the default matches every anchor.
      def links
        '//a'
      end

      # Parses an HTML chunk into a Nokogiri document.
      def parser(chunk)
        Nokogiri::HTML(chunk)
      end

      # Memoized Datum built from the parsed response plus the words and
      # links XPath expressions.
      def datum
        @datum ||= Ubi::Datum.new(parser(request), words, links)
      end

      #
      # Make an HTTP(S) request to the consultor's query URL and
      # return the response body.
      #
      # opts - only echoed in the log line; not otherwise used here.
      #
      # Raises Timeout::Error when the request takes longer than 10 seconds.
      #
      def request(opts = {})
        # Timeout.timeout replaces the bare Object#timeout helper, which is
        # deprecated and absent from current Ruby versions.
        Timeout.timeout(10) do
          uri = URI.parse(query_url)
          puts "#{self} working on `#{@thema}` (#{query_url}) #{opts}"
          uri.open(HEADERS).read
        end
      end

      class << self
        # Registers every subclass in Ubi.araneas as it is defined.
        def inherited(base)
          super # keep the standard inheritance hook chain intact
          puts "Using aranea #{base}"
          Ubi.araneas << base
        end

        #
        # Human-readable name of the consultor
        #
        def name
          raise "Not implemented by #{self}"
        end

        #
        # Url to query
        #
        def url
          raise "Not implemented by #{self}"
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Ubi
  module Consultor
    # Consultor that looks the thema up on Bing.
    class Bing < Base
      # Endpoint prefix the encoded query is appended to.
      def self.url
        'https://bing.com/search?'
      end

      # Request parameters: the thema name as the search term, plus a
      # fixed 'ubi' value for pc.
      def query
        { q: @thema.name, pc: 'ubi' }
      end

      # XPath for anchors whose href does not contain "bing".
      def links
        '//a[not (contains(@href,"bing"))]'
      end

      # XPath for the result items.
      # NOTE(review): confirm this selector matches Bing's result markup.
      def words
        '//li[@class="g"]'
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Ubi
  module Consultor
    # Consultor that looks the thema up on DuckDuckGo.
    class DuckDuckGo < Base
      # Endpoint prefix the encoded query is appended to.
      def self.url
        'https://duckduckgo.com/?'
      end

      # Request parameters: the thema name as the search term, plus a
      # fixed 'ubi' value for t.
      def query
        { q: @thema.name, t: 'ubi' }
      end

      # XPath for anchors whose href does not contain "duckduckgo".
      def links
        '//a[not (contains(@href,"duckduckgo"))]'
      end

      # XPath for the result items.
      # NOTE(review): confirm this selector matches DuckDuckGo's result markup.
      def words
        '//li[@class="g"]'
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Ubi
  module Consultor
    # Consultor that looks the thema up on Google (HTML results page).
    class Google < Base # HTML
      # Endpoint prefix the encoded query is appended to.
      def self.url
        'https://www.google.com.br/search?'
      end

      # Request parameters: the thema name as the search term.
      def query
        { q: @thema.name }
      end

      # XPath for anchors whose href does not contain "google".
      def links
        '//a[not (contains(@href,"google"))]'
      end

      # XPath for the result container.
      def words
        '//div[@class="res"]'
      end
    end
  end
end
|
File without changes
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Ubi
  module Consultor
    # Consultor that looks the thema up on Yahoo.
    class Yahoo < Base
      # Endpoint prefix the encoded query is appended to.
      def self.url
        'https://search.yahoo.com/yhs/search?'
      end

      # Request parameters: the thema name as the search term (p), plus a
      # fixed 'ubi' value for hspart.
      def query
        { p: @thema.name, hspart: 'ubi' }
      end

      # XPath for anchors whose href does not contain "yahoo".
      def links
        '//a[not (contains(@href,"yahoo"))]'
      end

      # XPath for the result items.
      # NOTE(review): confirm this selector matches Yahoo's result markup.
      def words
        '//li[@class="g"]'
      end
    end
  end
end
|