ubi 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +11 -0
  5. data/Guardfile +16 -0
  6. data/MIT-LICENSE +19 -0
  7. data/README.md +57 -0
  8. data/Rakefile +9 -0
  9. data/bin/ubi +10 -0
  10. data/lib/ubi.rb +37 -0
  11. data/lib/ubi/aranea.rb +42 -0
  12. data/lib/ubi/artifex.rb +28 -0
  13. data/lib/ubi/consultor.rb +61 -0
  14. data/lib/ubi/consultores/bing.rb +26 -0
  15. data/lib/ubi/consultores/duckduckgo.rb +26 -0
  16. data/lib/ubi/consultores/facebook.rb +6 -0
  17. data/lib/ubi/consultores/foursquare.rb +6 -0
  18. data/lib/ubi/consultores/google.rb +24 -0
  19. data/lib/ubi/consultores/linkedin.rb +0 -0
  20. data/lib/ubi/consultores/twitter.rb +6 -0
  21. data/lib/ubi/consultores/wikipedia.rb +6 -0
  22. data/lib/ubi/consultores/yahoo.rb +26 -0
  23. data/lib/ubi/datum.rb +43 -0
  24. data/lib/ubi/impero.rb +20 -0
  25. data/lib/ubi/memoria.rb +72 -0
  26. data/lib/ubi/memorias/address.rb +71 -0
  27. data/lib/ubi/memorias/document.rb +50 -0
  28. data/lib/ubi/memorias/email.rb +19 -0
  29. data/lib/ubi/memorias/phone.rb +33 -0
  30. data/lib/ubi/memorias/site.rb +29 -0
  31. data/lib/ubi/memorias/social.rb +20 -0
  32. data/lib/ubi/memorias/who.rb +20 -0
  33. data/lib/ubi/thema.rb +62 -0
  34. data/lib/ubi/version.rb +4 -0
  35. data/spec/fixtures/email.txt +5 -0
  36. data/spec/fixtures/mobile.txt +17 -0
  37. data/spec/fixtures/page.txt +21 -0
  38. data/spec/fixtures/phone.txt +17 -0
  39. data/spec/fixtures/site.txt +21 -0
  40. data/spec/spec_helper.rb +40 -0
  41. data/spec/ubi/aranea_spec.rb +19 -0
  42. data/spec/ubi/artifex_spec.rb +4 -0
  43. data/spec/ubi/memorias/address_spec.rb +56 -0
  44. data/spec/ubi/memorias/document_spec.rb +48 -0
  45. data/spec/ubi/memorias/email_spec.rb +59 -0
  46. data/spec/ubi/memorias/phone_spec.rb +79 -0
  47. data/spec/ubi/memorias/site_spec.rb +82 -0
  48. data/spec/ubi/thema_spec.rb +33 -0
  49. data/ubi.gemspec +39 -0
  50. metadata +232 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 330dfff07aab154b3fc714a71b10cf0f148d541d
4
+ data.tar.gz: 81ff08e9a049671a6f2c51c43641007466fa30c3
5
+ SHA512:
6
+ metadata.gz: fae80b06b1e6f5bac23357f8b4c09ef45879ba62b4e1e6e90c9e42a16490f7496cc614b27538beaed7d6529d8784fce29a79722e4a351379079cc0e4be6bf1cb
7
+ data.tar.gz: 4a4ec085b39e27a0e64addcf9c22fff3a969bdff21a72c8684bcb500451d3c4503eb1935383fd96715dd72257082f4851476b9b2c8d4a26244b6363b9264588b
@@ -0,0 +1,9 @@
1
+ .DS_Store
2
+ coverage
3
+ rdoc
4
+ pkg
5
+ .bundle
6
+ Gemfile.lock
7
+ *.gem
8
+ *.log
9
+ tmp/*
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --tty
2
+ --color
3
+ --format documentation
data/Gemfile ADDED
@@ -0,0 +1,11 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
4
+
5
+ gem 'pry'
6
+ gem 'rake'
7
+ gem 'rspec', '>= 3'
8
+ gem 'rubocop'
9
+ gem 'guard'
10
+ gem 'guard-rspec'
11
+ gem 'guard-rubocop'
@@ -0,0 +1,16 @@
1
+ #
2
+ # Ubi Guard
3
+ #
4
+ guard :rubocop, all_on_start: false, keep_failed: false, cli: ['-D'] do
5
+ watch(/.+\.rb$/)
6
+ watch(/(?:.+\/)?\.rubocop\.yml$/) { |m| File.dirname(m[0]) }
7
+ end
8
+
9
+ guard :rspec, cmd: 'bundle exec rspec' do
10
+ watch(/^spec\/.+_spec\.rb$/)
11
+ watch(/^lib\/(.+)\.rb$/) { |m| "spec/#{m[1]}_spec.rb" }
12
+ watch(/^generators\/(.+)\.rb$/) { |_m| 'spec/schemaless/worker_spec' }
13
+
14
+ watch('lib/ubi.rb') { 'spec' }
15
+ watch('spec/spec_helper.rb') { 'spec' }
16
+ end
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2014-2015 Ubi Authors
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a
4
+ copy of this software and associated documentation files (the "Software"),
5
+ to deal in the Software without restriction, including without limitation
6
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
7
+ and/or sell copies of the Software, and to permit persons to whom the
8
+ Software is furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be
11
+ included in all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
14
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
15
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
16
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
17
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
18
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
19
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,57 @@
1
+ _ _
2
+ /\ /\ | |__ (_)
3
+ / / \ \| '_ \ | |
4
+ \ \_/ /| |_) || |
5
+ \___/ |_.__/ |_|
6
+
7
+
8
+ Ubi finds information in the subject's webpage(s).
9
+ A forager.
10
+
11
+
12
+ ## Thema -> Subject/Matter
13
+
14
+ - Name
15
+ - Email
16
+ - URL
17
+
18
+
19
+ ## Memoria -> Attribute/Trait/Memory
20
+
21
+ - URLs
22
+ - Emails
23
+ - Phones
24
+ - Social
25
+ - Files
26
+ - Documents
27
+ - Address
28
+ - Logo/Images
29
+
30
+
31
+ ## Aranea -> Spider/Crawler
32
+
33
+ Search
34
+ API
35
+
36
+ Others
37
+
38
+
39
+ ## Use
40
+
41
+ ```
42
+ Ubi::Thema.new('A Company on City').urls
43
+ Ubi::Thema.new('A Company on City', 'company.com').phones
44
+ ```
45
+
46
+ ### Memorias Directly
47
+
48
+ ```
49
+ Ubi::Memoria::Email.parse('A text with some valid@emails.com')
50
+ Ubi::Memoria::Phone.parse('A text with some +55-5555-5555')
51
+ Ubi::Memoria::Site.parse('A text with some http://urls.com')
52
+ ```
53
+
54
+ ##### Keep crawling
55
+
56
+ > Ubi and the class names are Latin, in case you're curious.
57
+ > Yeah, apparently I've got plenty of free time.
@@ -0,0 +1,9 @@
1
+ require 'bundler/gem_tasks'
2
+
3
+ require 'rspec/core/rake_task'
4
+ require 'rubocop/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new
7
+ RuboCop::RakeTask.new
8
+
9
+ task default: [:spec, :rubocop]
data/bin/ubi ADDED
@@ -0,0 +1,10 @@
1
+ #!/usr/bin/env ruby
2
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+
4
+ require 'ubi'
5
+ require 'ubi/impero'
6
+
7
+ puts
8
+ puts ' Ubi!'
9
+
10
+ Ubi::Impero.start(ARGV)
@@ -0,0 +1,37 @@
1
+ require 'pry'
2
+ require 'thor'
3
+ require 'net/http'
4
+ require 'net/https'
5
+ require 'uri'
6
+ require 'open-uri'
7
+ require 'json'
8
+ require 'phonelib'
9
+ require 'nokogiri'
10
+ require 'active_model'
11
+ require 'active_support'
12
+ require 'active_support/core_ext/hash'
13
+ require 'ubi/version'
14
+
15
+ # Ubiquitous getters
16
+ module Ubi
17
+ cattr_accessor(:memorias) { [] }
18
+ cattr_accessor(:araneas) { [] }
19
+
20
+ # Phonie.configuration.default_country_code = '55'
21
+ # Phonie.configuration.n1_length = 4
22
+ end
23
+
24
+ require 'ubi/aranea'
25
+ require 'ubi/memoria'
26
+ require 'ubi/consultor'
27
+
28
+ %w( memorias consultores ).each do |ns|
29
+ Dir["#{File.dirname(__FILE__)}/ubi/#{ns}/**.rb"].sort.each do |lib|
30
+ require lib
31
+ end
32
+ end
33
+
34
+ # Loaded last, after everything above
35
+ require 'ubi/artifex'
36
+ require 'ubi/datum'
37
+ require 'ubi/thema'
@@ -0,0 +1,42 @@
1
+ require 'polipus'
2
+
3
+ module Ubi
4
+ # Base for araneas (spiders)
5
+ class Aranea
6
+ OPTIONS = {
7
+ workers: 3,
8
+ user_agent: "Ubi v#{Ubi::VERSION}",
9
+ depth_limit: 1
10
+
11
+ # storage: MemoryStore
12
+ }
13
+ attr_accessor :thema, :url, :datum
14
+
15
+ def initialize(thema, url, opts = {})
16
+ @thema = thema
17
+ @url = url
18
+ @opts = opts
19
+ end
20
+
21
+ delegate :name, to: :thema
22
+
23
+ def crawl!
24
+ Polipus.crawler(name, url, OPTIONS.merge(@opts)) do |crawler|
25
+ # In-place page processing
26
+ crawler.on_page_downloaded do |page|
27
+ # A nokogiri object
28
+ puts "'#{page.doc.css('title').text}' (#{page.url})"
29
+ end
30
+ end
31
+ end
32
+
33
+ def parser(chunk)
34
+ Nokogiri::HTML(chunk)
35
+ end
36
+
37
+ def datum
38
+ crawl! unless @datum
39
+ @datum
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,28 @@
1
+ module Ubi
2
+ # Artifex: I work!
3
+ class Artifex
4
+ include Consultor
5
+ attr_accessor :thema
6
+
7
+ def initialize(query)
8
+ @thema = Thema.new(query)
9
+ start_with_search
10
+ social_search
11
+ other_search
12
+ end
13
+
14
+ def start_with_search
15
+ [Google, Yahoo, Bing, DuckDuckGo].each do |s|
16
+ @thema.try_consultor(s)
17
+ end
18
+ end
19
+
20
+ def social_search
21
+ end
22
+
23
+ def other_search
24
+ end
25
+
26
+ delegate :spec, to: :thema
27
+ end
28
+ end
@@ -0,0 +1,61 @@
1
+ module Ubi
2
+ module Consultor
3
+ # Base for consultores: builds a query URL, fetches it and wraps the parsed page in a Datum
4
+ class Base
5
+ HEADERS = { 'User-Agent' => "Ubi v#{Ubi::VERSION}" }
6
+
7
+ def initialize(thema)
8
+ @thema = thema
9
+ end
10
+
11
+ def query_url
12
+ self.class.url + query.to_query
13
+ end
14
+
15
+ def links
16
+ '//a'
17
+ end
18
+
19
+ def parser(chunk)
20
+ Nokogiri::HTML(chunk)
21
+ end
22
+
23
+ def datum
24
+ @datum ||= Ubi::Datum.new(parser(request), words, links)
25
+ end
26
+
27
+ #
28
+ # Make an HTTP(S) request to the consultor's query URL and
29
+ # return the response body as a string.
30
+ #
31
+ def request(opts = {})
32
+ timeout(10) do
33
+ uri = URI.parse(query_url)
34
+ puts "#{self} working on `#{@thema}` (#{query_url}) #{opts}"
35
+ uri.open(HEADERS).read
36
+ end
37
+ end
38
+
39
+ class << self
40
+ def inherited(base)
41
+ puts "Using aranea #{base}"
42
+ Ubi.araneas << base
43
+ end
44
+
45
+ #
46
+ # Human-readable name of the aranea
47
+ #
48
+ def name
49
+ fail "Not implemented by #{self}"
50
+ end
51
+
52
+ #
53
+ # Url to query
54
+ #
55
+ def url
56
+ fail "Not implemented by #{self}"
57
+ end
58
+ end
59
+ end
60
+ end
61
+ end
@@ -0,0 +1,26 @@
1
+ module Ubi
2
+ module Consultor
3
+ # URL finder consultor
4
+ class Bing < Base
5
+ #
6
+ # query
7
+ def query
8
+ { q: @thema.name, pc: 'ubi' }
9
+ end
10
+
11
+ def links
12
+ '//a[not (contains(@href,"bing"))]'
13
+ end
14
+
15
+ def words
16
+ '//li[@class="g"]'
17
+ end
18
+
19
+ class << self
20
+ def url
21
+ 'https://bing.com/search?'
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,26 @@
1
+ module Ubi
2
+ module Consultor
3
+ # URL finder consultor
4
+ class DuckDuckGo < Base
5
+ #
6
+ # query
7
+ def query
8
+ { q: @thema.name, t: 'ubi' }
9
+ end
10
+
11
+ def links
12
+ '//a[not (contains(@href,"duckduckgo"))]'
13
+ end
14
+
15
+ def words
16
+ '//li[@class="g"]'
17
+ end
18
+
19
+ class << self
20
+ def url
21
+ 'https://duckduckgo.com/?'
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end
@@ -0,0 +1,6 @@
1
+ module Ubi
2
+ module Consultor
3
+ class Facebook < Base
4
+ end
5
+ end
6
+ end
@@ -0,0 +1,6 @@
1
+ module Ubi
2
+ module Consultor
3
+ class Foursquare < Base
4
+ end
5
+ end
6
+ end
@@ -0,0 +1,24 @@
1
+ module Ubi
2
+ module Consultor
3
+ # URL finder consultor
4
+ class Google < Base # HTML
5
+ def query
6
+ { q: @thema.name }
7
+ end
8
+
9
+ def links
10
+ '//a[not (contains(@href,"google"))]'
11
+ end
12
+
13
+ def words
14
+ '//div[@class="res"]'
15
+ end
16
+
17
+ class << self
18
+ def url
19
+ 'https://www.google.com.br/search?'
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end
File without changes
@@ -0,0 +1,6 @@
1
+ module Ubi
2
+ module Consultor
3
+ class Twitter < Base
4
+ end
5
+ end
6
+ end
@@ -0,0 +1,6 @@
1
+ module Ubi
2
+ module Consultor
3
+ class Wikipedia < Base
4
+ end
5
+ end
6
+ end
@@ -0,0 +1,26 @@
1
+ module Ubi
2
+ module Consultor
3
+ # URL finder consultor
4
+ class Yahoo < Base
5
+ #
6
+ # query
7
+ def query
8
+ { p: @thema.name, hspart: 'ubi' }
9
+ end
10
+
11
+ def links
12
+ '//a[not (contains(@href,"yahoo"))]'
13
+ end
14
+
15
+ def words
16
+ '//li[@class="g"]'
17
+ end
18
+
19
+ class << self
20
+ def url
21
+ 'https://search.yahoo.com/yhs/search?'
22
+ end
23
+ end
24
+ end
25
+ end
26
+ end