ubi 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +11 -0
  5. data/Guardfile +16 -0
  6. data/MIT-LICENSE +19 -0
  7. data/README.md +57 -0
  8. data/Rakefile +9 -0
  9. data/bin/ubi +10 -0
  10. data/lib/ubi.rb +37 -0
  11. data/lib/ubi/aranea.rb +42 -0
  12. data/lib/ubi/artifex.rb +28 -0
  13. data/lib/ubi/consultor.rb +61 -0
  14. data/lib/ubi/consultores/bing.rb +26 -0
  15. data/lib/ubi/consultores/duckduckgo.rb +26 -0
  16. data/lib/ubi/consultores/facebook.rb +6 -0
  17. data/lib/ubi/consultores/foursquare.rb +6 -0
  18. data/lib/ubi/consultores/google.rb +24 -0
  19. data/lib/ubi/consultores/linkedin.rb +0 -0
  20. data/lib/ubi/consultores/twitter.rb +6 -0
  21. data/lib/ubi/consultores/wikipedia.rb +6 -0
  22. data/lib/ubi/consultores/yahoo.rb +26 -0
  23. data/lib/ubi/datum.rb +43 -0
  24. data/lib/ubi/impero.rb +20 -0
  25. data/lib/ubi/memoria.rb +72 -0
  26. data/lib/ubi/memorias/address.rb +71 -0
  27. data/lib/ubi/memorias/document.rb +50 -0
  28. data/lib/ubi/memorias/email.rb +19 -0
  29. data/lib/ubi/memorias/phone.rb +33 -0
  30. data/lib/ubi/memorias/site.rb +29 -0
  31. data/lib/ubi/memorias/social.rb +20 -0
  32. data/lib/ubi/memorias/who.rb +20 -0
  33. data/lib/ubi/thema.rb +62 -0
  34. data/lib/ubi/version.rb +4 -0
  35. data/spec/fixtures/email.txt +5 -0
  36. data/spec/fixtures/mobile.txt +17 -0
  37. data/spec/fixtures/page.txt +21 -0
  38. data/spec/fixtures/phone.txt +17 -0
  39. data/spec/fixtures/site.txt +21 -0
  40. data/spec/spec_helper.rb +40 -0
  41. data/spec/ubi/aranea_spec.rb +19 -0
  42. data/spec/ubi/artifex_spec.rb +4 -0
  43. data/spec/ubi/memorias/address_spec.rb +56 -0
  44. data/spec/ubi/memorias/document_spec.rb +48 -0
  45. data/spec/ubi/memorias/email_spec.rb +59 -0
  46. data/spec/ubi/memorias/phone_spec.rb +79 -0
  47. data/spec/ubi/memorias/site_spec.rb +82 -0
  48. data/spec/ubi/thema_spec.rb +33 -0
  49. data/ubi.gemspec +39 -0
  50. metadata +232 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 330dfff07aab154b3fc714a71b10cf0f148d541d
4
+ data.tar.gz: 81ff08e9a049671a6f2c51c43641007466fa30c3
5
+ SHA512:
6
+ metadata.gz: fae80b06b1e6f5bac23357f8b4c09ef45879ba62b4e1e6e90c9e42a16490f7496cc614b27538beaed7d6529d8784fce29a79722e4a351379079cc0e4be6bf1cb
7
+ data.tar.gz: 4a4ec085b39e27a0e64addcf9c22fff3a969bdff21a72c8684bcb500451d3c4503eb1935383fd96715dd72257082f4851476b9b2c8d4a26244b6363b9264588b
@@ -0,0 +1,9 @@
1
+ .DS_Store
2
+ coverage
3
+ rdoc
4
+ pkg
5
+ .bundle
6
+ Gemfile.lock
7
+ *.gem
8
+ *.log
9
+ tmp/*
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --tty
2
+ --color
3
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,11 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
4
+
5
+ gem 'pry'
6
+ gem 'rake'
7
+ gem 'rspec', '>= 3'
8
+ gem 'rubocop'
9
+ gem 'guard'
10
+ gem 'guard-rspec'
11
+ gem 'guard-rubocop'
@@ -0,0 +1,16 @@
1
+ #
2
+ # Ubi Guard
3
+ #
4
+ guard :rubocop, all_on_start: false, keep_failed: false, cli: ['-D'] do
5
+ watch(/.+\.rb$/)
6
+ watch(/(?:.+\/)?\.rubocop\.yml$/) { |m| File.dirname(m[0]) }
7
+ end
8
+
9
+ guard :rspec, cmd: 'bundle exec rspec' do
10
+ watch(/^spec\/.+_spec\.rb$/)
11
+ watch(/^lib\/(.+)\.rb$/) { |m| "spec/#{m[1]}_spec.rb" }
12
+ watch(/^generators\/(.+)\.rb$/) { |_m| 'spec/schemaless/worker_spec' }
13
+
14
+ watch('lib/ubi.rb') { 'spec' }
15
+ watch('spec/spec_helper.rb') { 'spec' }
16
+ end
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2014-2015 Ubi Authors
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a
4
+ copy of this software and associated documentation files (the "Software"),
5
+ to deal in the Software without restriction, including without limitation
6
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
7
+ and/or sell copies of the Software, and to permit persons to whom the
8
+ Software is furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be
11
+ included in all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
14
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
15
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
16
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
17
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
18
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
19
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,57 @@
1
+ _ _
2
+ /\ /\ | |__ (_)
3
+ / / \ \| '_ \ | |
4
+ \ \_/ /| |_) || |
5
+ \___/ |_.__/ |_|
6
+
7
+
8
+ Ubi finds information in the subject's webpage(s).
9
+ A forager.
10
+
11
+
12
+ ## Thema -> Subject/Matter
13
+
14
+ - Name
15
+ - Email
16
+ - URL
17
+
18
+
19
+ ## Memoria -> Attribute/Trait/Memory
20
+
21
+ - URLs
22
+ - Emails
23
+ - Phones
24
+ - Social
25
+ - Files
26
+ - Documents
27
+ - Address
28
+ - Logo/Images
29
+
30
+
31
+ ## Aranea -> Spider/Crawler
32
+
33
+ Search
34
+ API
35
+
36
+ Others
37
+
38
+
39
+ ## Use
40
+
41
+ ```
42
+ Ubi::Thema.new('A Company on City').urls
43
+ Ubi::Thema.new('A Company on City', 'company.com').phones
44
+ ```
45
+
46
+ ### Memorias Directly
47
+
48
+ ```
49
+ Ubi::Memoria::Email.parse('A text with some valid@emails.com')
50
+ Ubi::Memoria::Phone.parse('A text with some +55-5555-5555')
51
+ Ubi::Memoria::Site.parse('A text with some http://urls.com')
52
+ ```
53
+
54
+ ##### Keep crawling
55
+
56
+ > Ubi and the class names are Latin, in case you're curious.
57
+ > Yeah, apparently I got plenty of free time.
@@ -0,0 +1,9 @@
1
+ require 'bundler/gem_tasks'
2
+
3
+ require 'rspec/core/rake_task'
4
+ require 'rubocop/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new
7
+ RuboCop::RakeTask.new
8
+
9
+ task default: [:spec, :rubocop]
data/bin/ubi ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+
4
+ require 'ubi'
5
+ require 'ubi/impero'
6
+
7
+ puts
8
+ puts ' Ubi!'
9
+
10
+ Ubi::Impero.start(ARGV)
@@ -0,0 +1,37 @@
1
+ require 'pry'
2
+ require 'thor'
3
+ require 'net/http'
4
+ require 'net/https'
5
+ require 'uri'
6
+ require 'open-uri'
7
+ require 'json'
8
+ require 'phonelib'
9
+ require 'nokogiri'
10
+ require 'active_model'
11
+ require 'active_support'
12
+ require 'active_support/core_ext/hash'
13
+ require 'ubi/version'
14
+
15
+ # Ubiquitous getters
16
+ module Ubi
17
+ cattr_accessor(:memorias) { [] }
18
+ cattr_accessor(:araneas) { [] }
19
+
20
+ # Phonie.configuration.default_country_code = '55'
21
+ # Phonie.configuration.n1_length = 4
22
+ end
23
+
24
+ require 'ubi/aranea'
25
+ require 'ubi/memoria'
26
+ require 'ubi/consultor'
27
+
28
+ %w( memorias consultores ).each do |ns|
29
+ Dir["#{File.dirname(__FILE__)}/ubi/#{ns}/**.rb"].sort.each do |lib|
30
+ require lib
31
+ end
32
+ end
33
+
34
+ # Loads after all
35
+ require 'ubi/artifex'
36
+ require 'ubi/datum'
37
+ require 'ubi/thema'
@@ -0,0 +1,42 @@
1
+ require 'polipus'
2
+
3
+ module Ubi
4
+ # Base for araneas (spiders)
5
+ class Aranea
6
+ OPTIONS = {
7
+ workers: 3,
8
+ user_agent: "Ubi v#{Ubi::VERSION}",
9
+ depth_limit: 1
10
+
11
+ # storage: MemoryStore
12
+ }
13
+ attr_accessor :thema, :url, :datum
14
+
15
+ def initialize(thema, url, opts = {})
16
+ @thema = thema
17
+ @url = url
18
+ @opts = opts
19
+ end
20
+
21
+ delegate :name, to: :thema
22
+
23
+ def crawl!
24
+ Polipus.crawler(name, url, OPTIONS.merge(@opts)) do |crawler|
25
+ # In-place page processing
26
+ crawler.on_page_downloaded do |page|
27
+ # A nokogiri object
28
+ puts "'#{page.doc.css('title').text}' (#{page.url})"
29
+ end
30
+ end
31
+ end
32
+
33
+ def parser(chunk)
34
+ Nokogiri::HTML(chunk)
35
+ end
36
+
37
+ def datum
38
+ crawl! unless @datum
39
+ @datum
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,28 @@
1
+ module Ubi
2
+ # Artifex: I work!
3
+ class Artifex
4
+ include Consultor
5
+ attr_accessor :thema
6
+
7
+ def initialize(query)
8
+ @thema = Thema.new(query)
9
+ start_with_search
10
+ social_search
11
+ other_search
12
+ end
13
+
14
+ def start_with_search
15
+ [Google, Yahoo, Bing, DuckDuckGo].each do |s|
16
+ @thema.try_consultor(s)
17
+ end
18
+ end
19
+
20
+ def social_search
21
+ end
22
+
23
+ def other_search
24
+ end
25
+
26
+ delegate :spec, to: :thema
27
+ end
28
+ end
@@ -0,0 +1,61 @@
1
+ module Ubi
2
+ module Consultor
3
+ # Base for consultores (search-engine queriers); subclasses are registered in Ubi.araneas
4
+ class Base
5
+ HEADERS = { 'User-Agent' => "Ubi v#{Ubi::VERSION}" }
6
+
7
+ def initialize(thema)
8
+ @thema = thema
9
+ end
10
+
11
+ def query_url
12
+ self.class.url + query.to_query
13
+ end
14
+
15
+ def links
16
+ '//a'
17
+ end
18
+
19
+ def parser(chunk)
20
+ Nokogiri::HTML(chunk)
21
+ end
22
+
23
+ def datum
24
+ @datum ||= Ubi::Datum.new(parser(request), words, links)
25
+ end
26
+
27
+ #
28
+ # Make an HTTP(S) request to this consultor's query URL and
29
+ # return the response body.
30
+ #
31
+ def request(opts = {})
32
+ timeout(10) do
33
+ uri = URI.parse(query_url)
34
+ puts "#{self} working on `#{@thema}` (#{query_url}) #{opts}"
35
+ uri.open(HEADERS).read
36
+ end
37
+ end
38
+
39
+ class << self
40
+ def inherited(base)
41
+ puts "Using aranea #{base}"
42
+ Ubi.araneas << base
43
+ end
44
+
45
+ #
46
+ # Human-readable name of the aranea
47
+ #
48
+ def name
49
+ fail "Not implemented by #{self}"
50
+ end
51
+
52
+ #
53
+ # Url to query
54
+ #
55
+ def url
56
+ fail "Not implemented by #{self}"
57
+ end
58
+ end
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,26 @@
1
+ module Ubi
2
+ module Consultor
3
+ # URL finder consultor
4
+ class Bing < Base
5
+ #
6
+ # query
7
+ def query
8
+ { q: @thema.name, pc: 'ubi' }
9
+ end
10
+
11
+ def links
12
+ '//a[not (contains(@href,"bing"))]'
13
+ end
14
+
15
+ def words
16
+ '//li[@class="g"]'
17
+ end
18
+
19
+ class << self
20
+ def url
21
+ 'https://bing.com/search?'
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,26 @@
1
+ module Ubi
2
+ module Consultor
3
+ # URL finder consultor
4
+ class DuckDuckGo < Base
5
+ #
6
+ # query
7
+ def query
8
+ { q: @thema.name, t: 'ubi' }
9
+ end
10
+
11
+ def links
12
+ '//a[not (contains(@href,"duckduckgo"))]'
13
+ end
14
+
15
+ def words
16
+ '//li[@class="g"]'
17
+ end
18
+
19
+ class << self
20
+ def url
21
+ 'https://duckduckgo.com/?'
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,6 @@
1
+ module Ubi
2
+ module Consultor
3
+ class Facebook < Base
4
+ end
5
+ end
6
+ end
@@ -0,0 +1,6 @@
1
+ module Ubi
2
+ module Consultor
3
+ class Foursquare < Base
4
+ end
5
+ end
6
+ end
@@ -0,0 +1,24 @@
1
+ module Ubi
2
+ module Consultor
3
+ # URL finder consultor
4
+ class Google < Base # HTML
5
+ def query
6
+ { q: @thema.name }
7
+ end
8
+
9
+ def links
10
+ '//a[not (contains(@href,"google"))]'
11
+ end
12
+
13
+ def words
14
+ '//div[@class="res"]'
15
+ end
16
+
17
+ class << self
18
+ def url
19
+ 'https://www.google.com.br/search?'
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end
File without changes
@@ -0,0 +1,6 @@
1
+ module Ubi
2
+ module Consultor
3
+ class Twitter < Base
4
+ end
5
+ end
6
+ end
@@ -0,0 +1,6 @@
1
+ module Ubi
2
+ module Consultor
3
+ class Wikipedia < Base
4
+ end
5
+ end
6
+ end
@@ -0,0 +1,26 @@
1
+ module Ubi
2
+ module Consultor
3
+ # URL finder consultor
4
+ class Yahoo < Base
5
+ #
6
+ # query
7
+ def query
8
+ { p: @thema.name, hspart: 'ubi' }
9
+ end
10
+
11
+ def links
12
+ '//a[not (contains(@href,"yahoo"))]'
13
+ end
14
+
15
+ def words
16
+ '//li[@class="g"]'
17
+ end
18
+
19
+ class << self
20
+ def url
21
+ 'https://search.yahoo.com/yhs/search?'
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end