omni_scrapper 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/omni_scrapper.rb +11 -0
- data/lib/omni_scrapper/configuration.rb +53 -0
- data/lib/omni_scrapper/exceptions/crawler_not_defined_exception.rb +3 -0
- data/lib/omni_scrapper/exceptions/invalid_field_arguments_exception.rb +7 -0
- data/lib/omni_scrapper/exceptions/unknown_framework_exception.rb +7 -0
- data/lib/omni_scrapper/exceptions/unsupported_framework_exception.rb +7 -0
- data/lib/omni_scrapper/file_utils.rb +87 -0
- data/lib/omni_scrapper/normalizers.rb +6 -0
- data/lib/omni_scrapper/normalizers/base.rb +15 -0
- data/lib/omni_scrapper/normalizers/phone.rb +12 -0
- data/lib/omni_scrapper/page.rb +92 -0
- data/lib/omni_scrapper/rake.rb +6 -0
- data/lib/omni_scrapper/result.rb +29 -0
- data/lib/omni_scrapper/schema.rb +13 -0
- data/lib/omni_scrapper/scrapper.rb +35 -0
- data/lib/omni_scrapper/scrapper_builder.rb +75 -0
- data/lib/tasks/generate.rake +13 -0
- data/lib/tasks/install.rake +17 -0
- data/omni_scrapper.gemspec +13 -0
- data/todo.md +3 -0
- metadata +22 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8067759f0e4de39da95f44a037f03d65c31ea05a
|
4
|
+
data.tar.gz: be04bec953f8943e3e719f708d8b593dbbc71150
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8995c80582d6bda9ed4a317185970d946aa786371fb03a3ac8e0387dc3041d482f24eddaa7d9af954ff25b9d5604e1d16139e00913b2c532b16c994e78c2cac2
|
7
|
+
data.tar.gz: 036a83f2936f563f34050fb5db311be780848c6212de661f045e19dd1a0e72e90046dd39ccc68f574ef56f996bb31160df47d4a2dc268743b1f6c73f002a2868
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require 'omni_scrapper/normalizers'
|
3
|
+
require 'omni_scrapper/result'
|
4
|
+
require 'omni_scrapper/file_utils'
|
5
|
+
require 'omni_scrapper/page'
|
6
|
+
require 'omni_scrapper/schema'
|
7
|
+
require 'omni_scrapper/configuration'
|
8
|
+
require 'omni_scrapper/scrapper_builder'
|
9
|
+
require 'omni_scrapper/scrapper'
|
10
|
+
|
11
|
+
# Load every Ruby file found under the userspace installation directory.
# Paths are required relative to the current working directory.
OmniScrapper::FileUtils.userspace_files.each do |path|
  require("./#{path}")
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'omni_scrapper/exceptions/crawler_not_defined_exception'
|
2
|
+
|
3
|
+
module OmniScrapper
  # Collects the settings declared for a scrapper.
  #
  # Two kinds of settings are stored:
  # * fields  - declared through #field and kept in #fields as
  #   `name => options`;
  # * anchors - any other `config.<name> <value>` call, kept in #anchors as
  #   `name => { pattern: value }` and read back with `config.<name>`.
  class Configuration
    attr_reader :fields, :anchors

    # Field options that must be the only option given for a field.
    SINGLE_OPTS = %i[do method].freeze

    def initialize
      @fields = {}
      @anchors = {}
    end

    # Registers a field definition.
    #
    # @param name [Symbol] key under which the options are stored
    # @param options [Hash] field options
    # @raise [OmniScrapper::InvalidFieldArgumentsException] when :do or
    #   :method is combined with any other option
    # @return [Hash] the stored options
    def field(name, options = {})
      #validate_crawler_presence!(options)
      validate_field_options!(options)
      # TODO: validate if field method is defined
      @fields[name] = options
    end

    # Dynamic anchor accessor: a call without arguments reads the anchor,
    # a call with arguments stores the first argument as its pattern.
    def method_missing(method_name, *args)
      if args.empty?
        get_variable(method_name)
      else
        set_variable(method_name, args)
      end
    end

    # Reports the anchors that have been set so far, keeping `respond_to?`
    # in line with what #method_missing can return a value for.
    def respond_to_missing?(method_name, include_private = false)
      @anchors.key?(method_name) || super
    end

    private

    # Stores the first positional argument as the anchor's pattern.
    def set_variable(name, options)
      @anchors[name] = { pattern: options.first }
    end

    # Returns the stored pattern, or nil when the anchor was never set.
    # Returning nil (rather than failing on a nil lookup) lets callers write
    # `if config.schema` for optional anchors.
    def get_variable(name)
      anchor = @anchors[name]
      anchor && anchor[:pattern]
    end

    # Currently unused (its only call site is commented out). The guard
    # condition was never implemented, so calling it always raises.
    def validate_crawler_presence!(options)
      #return if
      fail OmniScrapper::CrawlerNotDefinedException
    end

    # Rejects option sets that combine an exclusive option (see SINGLE_OPTS)
    # with anything else.
    def validate_field_options!(options)
      exclusive = (SINGLE_OPTS & options.keys).first
      return unless exclusive && options.keys.size > 1

      others = (options.keys - [exclusive]).map { |key| ":#{key}" }.join(', ')
      fail OmniScrapper::InvalidFieldArgumentsException,
           ":#{exclusive} can not be used with other args(#{others})"
    end
  end
end
|
@@ -0,0 +1,87 @@
|
|
1
|
+
module OmniScrapper
  # Filesystem helpers used by the install/generate rake tasks and by the
  # userspace loader.
  #
  # NOTE(review): the exception classes raised below are not required by
  # this file; presumably they are loaded elsewhere - confirm.
  module FileUtils
    BASE_NAME = ''
    # Sub-directories created inside the installation directory.
    DIRS = %w[schemas crawlers scrappers normalizers].freeze

    class << self
      # Creates the installation directory and its standard sub-directories.
      # Directories that already exist are left untouched, so a partially
      # completed installation can be finished by running this again.
      def install
        root = installation_location
        ensure_dir(root)
        DIRS.each { |dir| ensure_dir("#{root}/#{dir}") }
      end

      # Generates the skeleton files for a new scrapper.
      #
      # @param name [String] scrapper name, used as directory and module name
      # @raise [ArgumentError] when name is nil or blank
      # @raise [SystemCallError] when the scrapper directory cannot be
      #   created (e.g. it already exists, which protects existing files
      #   from being overwritten)
      def generate_scrapper(name)
        raise ArgumentError, 'scrapper name is required' if name.nil? || name.to_s.strip.empty?

        target = "#{installation_location}/scrappers/#{name}"
        Dir.mkdir(target)
        File.write("#{target}/scrapper.rb", scrapper_template(name))
        File.write("#{target}/scrap_methods.rb", scrap_methods_template(name))
      end

      # @return [Array<String>] all .rb files below the installation directory
      def userspace_files
        Dir.glob(File.join(installation_location, "**", "*.rb"))
      end

      # @return [Boolean] whether the installation directory exists
      def installed?
        # Dir.exists? was removed in Ruby 3.2; Dir.exist? works everywhere.
        Dir.exist?(installation_location)
      end

      # @return [String] installation directory for the detected framework
      # @raise [OmniScrapper::UnsupportedFrameworkException] under Rails
      # @raise [OmniScrapper::UnknownFrameworkException] when no known
      #   framework constant is defined
      def installation_location
        if hanami?
          'apps/scrappers'
        elsif rails?
          fail OmniScrapper::UnsupportedFrameworkException, 'Rails is not supported yet.'
        else
          fail OmniScrapper::UnknownFrameworkException
        end
      end

      # @return [Boolean] whether the Hanami constant is defined
      def hanami?
        defined?(Hanami) ? true : false
      end

      # @return [Boolean] whether the Rails constant is defined
      def rails?
        defined?(Rails) ? true : false
      end

      # Source of the generated scrapper.rb for the given scrapper name.
      def scrapper_template(name)
        <<-TEMPLATE
require_relative '../gallery'
require_relative '../schema'

# Usage example:
# Scrappers::#{name}::Scrapper.run { |data| p data }
module Scrappers
  module #{name}
    class Scrapper < ::Scrappers::Gallery
      include OmniScrapper

      setup do |config|
        config.schema ::Scrappers::Schema
        config.crawler ::Scrappers::Gallery

        config.entrypoint ''
        config.next_page_link ''

        config.field :name,
          selector: ''
      end
    end
  end
end
        TEMPLATE
      end

      # Source of the generated scrap_methods.rb; `name` is currently unused.
      def scrap_methods_template(name)
        <<-TEMPLATE
module ScrapMethods
end
        TEMPLATE
      end

      private

      # Creates the directory unless it is already present.
      def ensure_dir(path)
        Dir.mkdir(path) unless Dir.exist?(path)
      end
    end
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
module OmniScrapper
  # Wraps one fetched page and extracts the configured fields from it.
  #
  # `page` must respond to #xpath (whose result responds to #text) and #uri;
  # `config` must respond to #fields, #anchors and #schema.
  class Page
    attr_accessor :page, :config

    def initialize(page, config)
      self.page = page
      self.config = config
    end

    # @return [Hash] scraped values keyed by field name, plus :id_within_site
    def data
      validate_data!(prepare_data)
    end

    private

    # TODO: should be moved to scrapper
    # Builds the result hash: one entry per configured field, then
    # :id_within_site (which wins over a field of the same name).
    def prepare_data
      scraped = config.fields.each_with_object({}) do |(field_name, field_options), result|
        result[field_name] = get_field(field_options)
      end
      scraped.merge(id_within_site: id_within_site)
    end

    # Runs the schema validation when a schema is configured and returns the
    # data unchanged.
    def validate_data!(result_data)
      # TODO: Fix this is not working
      # schema_pattern method is not available here
      # uncomment when move this to scrapper level
      config.schema.new.validate!(result_data) if config.schema
      result_data
    end

    # Extracts the page identifier from the page URI using the
    # :id_within_site anchor pattern. When that anchor is not configured the
    # whole URI string is returned (same rule as #extract without a pattern).
    def id_within_site
      anchor = config.anchors[:id_within_site]
      extract(page.uri.to_s, anchor && anchor[:pattern])
    end

    # Resolves one field. :method and :do short-circuit the selector
    # pipeline; otherwise the value goes through
    # find -> normalize -> extract -> type_cast.
    def get_field(options)
      return __send__(options[:method], page) if options[:method]
      return options[:do].call(page) if options[:do]

      value = find(options[:selector])
      value = normalize(value, options[:normalizer])
      value = extract(value, options[:pattern])
      type_cast(value, options[:type_cast_to])
    end

    # Returns the stripped text of the nodes matching the XPath selector.
    # '/tbody' steps are removed from the selector before the lookup.
    def find(selector)
      page.xpath(selector.gsub('/tbody', '')).text.strip
    end

    # Returns the first capture (or first match) of pattern in value.
    # The value is returned untouched when there is no pattern or when the
    # value itself is nil (e.g. a normalizer produced nil).
    def extract(value, pattern)
      return value if pattern.nil? || value.nil?

      value.scan(pattern).flatten.first
    end

    # Casts the value; only Integer is handled, anything else is a no-op.
    def type_cast(value, type_class)
      return value unless type_class

      case type_class.to_s
      when 'Integer'
        value.to_i
      else
        value
      end
    end

    # Applies a normalizer to the value.
    #
    # @param normalizer [nil, Class, Symbol, String, #call]
    #   nil            - value is returned as is;
    #   Class          - `normalizer.new(value).normalized`;
    #   Symbol/String  - snake_case name of a class under the Normalizers
    #                    namespace, used like a Class;
    #   callable       - `normalizer.call(value)`.
    # @raise [ArgumentError] for any other normalizer, instead of silently
    #   turning the value into nil
    def normalize(value, normalizer)
      case normalizer
      when nil
        value
      when Class
        normalizer.new(value).normalized
      when Symbol, String
        class_name = normalizer.to_s.split('_').map(&:capitalize).join
        normalizers_namespace.const_get(class_name).new(value).normalized
      else
        unless normalizer.respond_to?(:call)
          raise ArgumentError, "unsupported normalizer: #{normalizer.inspect}"
        end

        normalizer.call(value)
      end
    end

    def normalizers_namespace
      root_namespace.const_get('Normalizers')
    end

    def root_namespace
      @root_namespace ||= ::OmniScrapper
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# Digest::MD5 is used below; require it here so this file does not depend
# on another library having loaded it first.
require 'digest'

module OmniScrapper
  # Value holder for one scraped page: the scrapper name, the scraped data,
  # the creation time and a checksum of the data.
  class Result
    attr_accessor :scrapper_name, :data, :timestamp, :checksum

    # @param scrapper_name [Object] identifier of the scrapper that produced
    #   the result; the timestamp is taken at construction time
    def initialize(scrapper_name)
      self.scrapper_name = scrapper_name
      self.timestamp = Time.now
    end

    # Stores the data and computes its checksum.
    #
    # @param data [Object] scraped data
    # @return [String] the computed checksum
    def build(data)
      self.data = data
      self.checksum = Signature.new(data).calculate
    end

    # Computes the checksum of a data object. The checksum is the MD5 hex
    # digest of `data.to_s`, so it is only as stable as that string form.
    # (The original `private` keyword in front of this class had no effect
    # on constant visibility and was dropped; the class stays reachable as
    # before.)
    class Signature
      attr_accessor :data

      def initialize(data)
        self.data = data
      end

      # @return [String] MD5 hex digest of data.to_s
      def calculate
        Digest::MD5.hexdigest(data.to_s)
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# Mixin for scrapper classes. Including it adds the class-level `run`
# shortcut and the instance-level constructor and `scrap_page` helper.
module OmniScrapper
  # Hook: gives the including class the ClassMethods API.
  def self.included(base)
    base.extend(ClassMethods)
  end

  # Builds a configuration, lets the caller fill it in via the block and
  # defines the scrapper classes from it.
  def self.setup(scrapper_name)
    configuration = OmniScrapper::Configuration.new
    yield configuration
    builder = OmniScrapper::ScrapperBuilder.new(scrapper_name, configuration)
    builder.define_classes
  end

  # @return [Array<Class>] every loaded class that includes this module
  def self.scrappers
    ObjectSpace.each_object(Class).select { |candidate| candidate < self }
  end

  module ClassMethods
    # Instantiates the scrapper and forwards the block to its #run.
    def run(&block)
      new.run(&block)
    end
  end

  # @param entrypoint_url [String, nil] start URL; falls back to the
  #   scrapper's entrypoint_pattern when omitted
  def initialize(entrypoint_url = nil)
    self.entrypoint = entrypoint_url || entrypoint_pattern
    self.agent = Mechanize.new { |mechanize| mechanize.user_agent_alias = 'Mac Safari' }
  end

  # Scrapes one page and wraps the extracted data in a Result.
  def scrap_page(page)
    page_data = scrapper_page_class.new(page, configuration).data
    result = OmniScrapper::Result.new(name)
    result.build(page_data)
    result
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module OmniScrapper
  # Defines, for one scrapper name and configuration, the classes
  # `Scrappers::<Name>::Scrapper` and `Scrappers::<Name>::Page`.
  #
  # <Name> is `scrapper_name.to_s.capitalize`. Note that String#capitalize
  # downcases every character after the first.
  class ScrapperBuilder
    attr_accessor :scrapper_name, :config

    # @param scrapper_name [Symbol, String] scrapper identifier
    # @param config [#crawler, #anchors] scrapper configuration
    def initialize(scrapper_name, config)
      self.scrapper_name = scrapper_name
      self.config = config
    end

    # Defines the Scrapper class and then the Page class.
    def define_classes
      define_scrapper_class(scrapper_name, config)
      define_page_class
    end

    private

    # Builds the Scrapper class as a subclass of the configured crawler.
    # The arguments shadow the accessors on purpose: inside the Class.new
    # block `self` is the new class, so the builder's own methods are not
    # reachable there and everything has to be captured as locals.
    def define_scrapper_class(scrapper_name, config)
      # Resolving the module here also creates the missing namespaces.
      current_module = scrapper_module
      klass = Class.new(config.crawler) do
        include OmniScrapper

        # One `<anchor>_pattern` reader per configured anchor.
        config.anchors.each do |name, options|
          define_method("#{name}_pattern") do
            options[:pattern]
          end
        end

        define_method :configuration do
          config
        end

        define_method :name do
          scrapper_name
        end

        define_method :scrapper_page_class do
          current_module.const_get('Page')
        end
      end

      current_module.const_set(scrapper_class_name, klass)
    end

    # Defines the Page class and mixes in ScrapMethods when that module is
    # visible from the scrapper module.
    def define_page_class
      page_class = scrapper_module.const_set('Page', Class.new(OmniScrapper::Page))
      page_class.__send__(:include, class_methods_module) if scrapper_module.const_defined?('ScrapMethods')
    end

    # Top-level namespace holding all scrappers; created when missing.
    def scrappers_namespace_module
      @scrappers_namespace_module ||= fetch_or_create_module(Object, scrappers_namespace_name)
    end

    def scrappers_namespace_name
      :Scrappers
    end

    def scrapper_class_name
      :Scrapper
    end

    # Module of this scrapper inside the namespace; created when missing.
    def scrapper_module
      @scrapper_module ||= fetch_or_create_module(scrappers_namespace_module, scrapper_name.to_s.capitalize)
    end

    # Looked up through the same module that #define_page_class checks, so
    # the check and the lookup cannot disagree.
    def class_methods_module
      @class_methods_module ||= scrapper_module.const_get('ScrapMethods')
    end

    # Returns the constant `name` defined directly on `parent`, defining it
    # as a fresh Module first when it does not exist. The previous
    # `unless defined? some_method` guards were always truthy for a method
    # name, so the namespaces were never created.
    def fetch_or_create_module(parent, name)
      if parent.const_defined?(name, false)
        parent.const_get(name, false)
      else
        parent.const_set(name, Module.new)
      end
    end
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'omni_scrapper/file_utils'
|
2
|
+
|
3
|
+
namespace :omni_scrapper do
  desc 'Generate scrapper'
  # Usage: rake omni_scrapper:generate[name]
  task :generate, :name do |_task, args|
    # A missing task argument arrives as nil; stop here instead of passing
    # it on to the generator.
    scrapper_name = args[:name].to_s.strip

    if scrapper_name.empty?
      p 'Scrapper name is required. Usage: rake omni_scrapper:generate[name]'
    elsif OmniScrapper::FileUtils.installed?
      OmniScrapper::FileUtils.generate_scrapper(scrapper_name)
      p 'Scrapper is generated'
    else
      p 'OmniScrapper is not installed. Please run `rake omni_scrapper:install` before.'
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'omni_scrapper/file_utils'
|
2
|
+
|
3
|
+
namespace :omni_scrapper do
  desc 'Install'
  # Creates the OmniScrapper directory layout. Does nothing unless the
  # Hanami constant is defined.
  task :install do
    next unless defined?(Hanami)

    p 'Hanami framework detected.'

    if OmniScrapper::FileUtils.installed?
      p 'OmniScrapper is already installed'
      next
    end

    p 'Installing'
    OmniScrapper::FileUtils.install
  end
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Gem specification for omni_scrapper.
Gem::Specification.new do |spec|
  spec.name          = "omni_scrapper"
  spec.version       = "1.0.1"
  spec.licenses      = ['MIT']
  spec.summary       = "This is an example!"
  spec.description   = "Much longer explanation of the example!"
  spec.author        = "Stanislav Mekhonoshin"
  spec.email         = "ejabberd@gmail.com"
  spec.require_paths = ["lib"]

  # Package every file tracked by git, one path per record-separator-
  # delimited line of `git ls-files` output.
  tracked_files = `git ls-files`
  spec.files = tracked_files.split($/)

  spec.add_runtime_dependency "mechanize"
end
|
data/todo.md
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: omni_scrapper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Stanislav Mekhonoshin
|
@@ -29,7 +29,27 @@ email: ejabberd@gmail.com
|
|
29
29
|
executables: []
|
30
30
|
extensions: []
|
31
31
|
extra_rdoc_files: []
|
32
|
-
files:
|
32
|
+
files:
|
33
|
+
- lib/omni_scrapper.rb
|
34
|
+
- lib/omni_scrapper/configuration.rb
|
35
|
+
- lib/omni_scrapper/exceptions/crawler_not_defined_exception.rb
|
36
|
+
- lib/omni_scrapper/exceptions/invalid_field_arguments_exception.rb
|
37
|
+
- lib/omni_scrapper/exceptions/unknown_framework_exception.rb
|
38
|
+
- lib/omni_scrapper/exceptions/unsupported_framework_exception.rb
|
39
|
+
- lib/omni_scrapper/file_utils.rb
|
40
|
+
- lib/omni_scrapper/normalizers.rb
|
41
|
+
- lib/omni_scrapper/normalizers/base.rb
|
42
|
+
- lib/omni_scrapper/normalizers/phone.rb
|
43
|
+
- lib/omni_scrapper/page.rb
|
44
|
+
- lib/omni_scrapper/rake.rb
|
45
|
+
- lib/omni_scrapper/result.rb
|
46
|
+
- lib/omni_scrapper/schema.rb
|
47
|
+
- lib/omni_scrapper/scrapper.rb
|
48
|
+
- lib/omni_scrapper/scrapper_builder.rb
|
49
|
+
- lib/tasks/generate.rake
|
50
|
+
- lib/tasks/install.rake
|
51
|
+
- omni_scrapper.gemspec
|
52
|
+
- todo.md
|
33
53
|
homepage:
|
34
54
|
licenses:
|
35
55
|
- MIT
|