omni_scrapper 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ce4ead1a8478e6bc48b9588ae79f417a7cac6093
4
- data.tar.gz: 73022f26551c76efc579773e9d3670a21249b7ac
3
+ metadata.gz: 8067759f0e4de39da95f44a037f03d65c31ea05a
4
+ data.tar.gz: be04bec953f8943e3e719f708d8b593dbbc71150
5
5
  SHA512:
6
- metadata.gz: 218d997e820e1b77eb2629ebed9c67f4b3bdd077082a2cf0dc347d77a9190ec525ff1e82c6dcda84d0389f9bebcd0cab550a0c2911a4fc02d7748ee178844d3f
7
- data.tar.gz: 1edd10a0e2f92247e3e4a033763d5b7cd582b0399795c3c745daca21acdec4d384189400488281661a6b5524af23356642b14e73c3cc73b7909245a7dfb18586
6
+ metadata.gz: 8995c80582d6bda9ed4a317185970d946aa786371fb03a3ac8e0387dc3041d482f24eddaa7d9af954ff25b9d5604e1d16139e00913b2c532b16c994e78c2cac2
7
+ data.tar.gz: 036a83f2936f563f34050fb5db311be780848c6212de661f045e19dd1a0e72e90046dd39ccc68f574ef56f996bb31160df47d4a2dc268743b1f6c73f002a2868
@@ -0,0 +1,11 @@
1
+ require 'mechanize'
2
+ require 'omni_scrapper/normalizers'
3
+ require 'omni_scrapper/result'
4
+ require 'omni_scrapper/file_utils'
5
+ require 'omni_scrapper/page'
6
+ require 'omni_scrapper/schema'
7
+ require 'omni_scrapper/configuration'
8
+ require 'omni_scrapper/scrapper_builder'
9
+ require 'omni_scrapper/scrapper'
10
+
11
# Exception classes that can be raised while resolving the installation
# location or validating field options. None of the visible library files
# require them, so without these lines the failure paths end in NameError
# instead of the intended exception.
require 'omni_scrapper/exceptions/invalid_field_arguments_exception'
require 'omni_scrapper/exceptions/unknown_framework_exception'
require 'omni_scrapper/exceptions/unsupported_framework_exception'

# Load every userspace Ruby file found under the installation directory.
# Paths returned by userspace_files are relative, hence the "./" prefix, which
# makes `require` resolve them against the current working directory.
OmniScrapper::FileUtils.userspace_files.each { |file| require("./#{file}") }
@@ -0,0 +1,53 @@
1
+ require 'omni_scrapper/exceptions/crawler_not_defined_exception'
2
+
3
module OmniScrapper
  # Collects the settings of a single scrapper.
  #
  # Two kinds of settings are stored:
  # * fields  - registered through #field, kept as name => options hash;
  # * anchors - any other setting, written and read through dynamic methods
  #             (`config.entrypoint 'http://...'` stores, `config.entrypoint`
  #             reads). Each anchor is kept as name => { pattern: value }.
  class Configuration
    attr_reader :fields, :anchors

    # Field options that must be the only option given for a field.
    SINGLE_OPTS = %i[do method].freeze

    def initialize
      @fields = {}
      @anchors = {}
    end

    # Registers a field.
    #
    # @param name [Symbol] field name, used as the key in #fields
    # @param options [Hash] field options; :do and :method are exclusive
    # @raise [OmniScrapper::InvalidFieldArgumentsException] when :do or
    #   :method is combined with any other option
    # @return [Hash] the stored options
    def field(name, options = {})
      #validate_crawler_presence!(options)
      validate_field_options!(options)
      # TODO: validate if field method is defined
      @fields[name] = options
    end

    # Dynamic anchor accessor: with arguments it stores the first argument as
    # the anchor's pattern, without arguments it returns the stored pattern
    # (nil when the anchor has never been set).
    def method_missing(method_name, *args, &block)
      if args.empty?
        get_variable(method_name)
      else
        set_variable(method_name, args)
      end
    end

    # Keeps respond_to? consistent with method_missing for anchors that have
    # already been stored.
    def respond_to_missing?(method_name, include_private = false)
      @anchors.key?(method_name) || super
    end

    private

    # Stores the first of the given values as the anchor's pattern.
    def set_variable(name, options)
      @anchors[name] = { pattern: options[0] }
    end

    # Returns the anchor's pattern, or nil when the anchor is not defined.
    # Returning nil (rather than failing on a nil lookup) lets callers write
    # checks such as `if config.schema`.
    def get_variable(name)
      # TODO: raise error if unexisting field is requested
      anchor = @anchors[name]
      anchor && anchor[:pattern]
    end

    # Currently unused (its only call site is commented out); always fails.
    def validate_crawler_presence!(options)
      #return if
      fail OmniScrapper::CrawlerNotDefinedException
    end

    # Fails when one of SINGLE_OPTS is combined with any other option.
    def validate_field_options!(options)
      incorrect_options = (SINGLE_OPTS & options.keys).first
      return unless incorrect_options && options.keys.size > 1
      incompatible_options = (options.keys - [incorrect_options]).map { |i| ":#{i}" }.join(', ')
      exception_message = ":#{incorrect_options} can not be used with other args(#{incompatible_options})"
      fail OmniScrapper::InvalidFieldArgumentsException, exception_message
    end
  end
end
@@ -0,0 +1,3 @@
1
module OmniScrapper
  # Error type used by Configuration#validate_crawler_presence!.
  # Adds no behaviour on top of StandardError.
  class CrawlerNotDefinedException < StandardError
  end
end
@@ -0,0 +1,7 @@
1
module OmniScrapper
  # Raised when a field is configured with an incompatible set of options.
  class InvalidFieldArgumentsException < StandardError
    # @param msg [String] error message; a generic one is used when omitted
    def initialize(msg = 'Invalid field arguments')
      super(msg)
    end
  end
end
@@ -0,0 +1,7 @@
1
module OmniScrapper
  # Raised when no known host framework is detected, so the location for the
  # generated directory structure cannot be determined.
  class UnknownFrameworkException < StandardError
    # @param msg [String] error message; a generic one is used when omitted
    def initialize(msg = 'Unknown framework. Do not know where to generate directory structure.')
      super
    end
  end
end
@@ -0,0 +1,7 @@
1
module OmniScrapper
  # Raised when the host framework is recognised but not supported.
  class UnsupportedFrameworkException < StandardError
    # @param msg [String] error message; a generic one is used when omitted
    def initialize(msg = 'This framework is not supported yet. Consider manual installation.')
      super(msg)
    end
  end
end
@@ -0,0 +1,87 @@
1
module OmniScrapper
  # Filesystem helpers: creates the userspace directory structure, generates
  # scrapper skeleton files and lists the userspace Ruby files.
  module FileUtils
    BASE_NAME = ''
    # Sub-directories created inside the installation location by .install.
    DIRS = %w[schemas crawlers scrappers normalizers].freeze

    class << self
      # Creates the installation directory and one sub-directory per DIRS entry.
      # Dir.mkdir raises if a directory already exists or its parent is missing.
      def install
        Dir.mkdir(installation_location)
        DIRS.each do |dir|
          Dir.mkdir("#{installation_location}/#{dir}")
        end
      end

      # Creates scrappers/<name>/ with scrapper.rb and scrap_methods.rb
      # rendered from the templates below.
      #
      # @param name [String] scrapper name, used as directory and module name
      def generate_scrapper(name)
        scrapper_dir = "#{installation_location}/scrappers/#{name}"
        Dir.mkdir(scrapper_dir)

        File.open("#{scrapper_dir}/scrapper.rb", "w+") do |f|
          f.write(scrapper_template(name))
        end

        File.open("#{scrapper_dir}/scrap_methods.rb", "w+") do |f|
          f.write(scrap_methods_template(name))
        end
      end

      # @return [Array<String>] every *.rb file under the installation location
      def userspace_files
        Dir.glob(File.join(installation_location, "**", "*.rb"))
      end

      # @return [Boolean] whether the installation directory exists
      def installed?
        # Dir.exists? was removed in Ruby 3.2; Dir.exist? is the supported name.
        Dir.exist?(installation_location)
      end

      # @return [String] relative path of the userspace directory
      # @raise [OmniScrapper::UnsupportedFrameworkException] under Rails
      # @raise [OmniScrapper::UnknownFrameworkException] when neither Hanami
      #   nor Rails is defined
      def installation_location
        if hanami?
          'apps/scrappers'
        elsif rails?
          fail OmniScrapper::UnsupportedFrameworkException, 'Rails is not supported yet.'
        else
          fail OmniScrapper::UnknownFrameworkException
        end
      end

      # Truthy when the Hanami constant is defined.
      def hanami?
        defined? Hanami
      end

      # Truthy when the Rails constant is defined.
      def rails?
        defined? Rails
      end

      # Source of the generated scrapper.rb.
      # NOTE(review): the template text is reproduced exactly as it appears in
      # the reviewed source, where leading indentation is not visible; `<<-`
      # keeps whatever leading whitespace the lines carry - confirm against the
      # shipped file before changing it.
      def scrapper_template(name)
        <<-TEMPLATE
require_relative '../gallery'
require_relative '../schema'

# Usage example:
# Scrappers::#{name}::Scrapper.run { |data| p data }
module Scrappers
module #{name}
class Scrapper < ::Scrappers::Gallery
include OmniScrapper

setup do |config|
config.schema ::Scrappers::Schema
config.crawler ::Scrappers::Gallery

config.entrypoint ''
config.next_page_link ''

config.field :name,
selector: ''
end
end
end
end
        TEMPLATE
      end

      # Source of the generated scrap_methods.rb; `name` is currently unused.
      def scrap_methods_template(name)
        <<-TEMPLATE
module ScrapMethods
end
        TEMPLATE
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
+ require 'omni_scrapper/normalizers/phone'
2
+
3
module OmniScrapper
  # Namespace for value normalizers. The module body is intentionally empty:
  # the concrete normalizer classes are defined in their own files.
  module Normalizers
  end
end
@@ -0,0 +1,15 @@
1
module OmniScrapper
  module Normalizers
    # Common parent of normalizers: wraps a raw value and expects subclasses
    # to provide #normalized.
    class Base
      attr_accessor :value

      # @param value [Object] raw value to be normalized
      def initialize(value)
        self.value = value
      end

      # Must be overridden; the base implementation always raises.
      #
      # @raise [RuntimeError] always
      def normalized
        raise 'Implement in child class'
      end
    end
  end
end
@@ -0,0 +1,12 @@
1
+ require_relative 'base'
2
+
3
module OmniScrapper
  module Normalizers
    # Extracts a phone number from free text and strips its formatting.
    class Phone < Base
      # Returns the last run of 10+ digits/spaces/dashes/parentheses found in
      # the value, with spaces, dashes and parentheses removed.
      #
      # @return [String] the phone number reduced to its digits
      # @raise [ArgumentError] when the value contains no such run (previously
      #   this surfaced as a NoMethodError on nil)
      def normalized
        candidate = value.to_s.scan(/([0-9 \-\(\)]{10,})/).flatten.last
        raise ArgumentError, "phone number not found in #{value.inspect}" unless candidate

        candidate.gsub(/[ \-\(\)]/, '')
      end
    end
  end
end
@@ -0,0 +1,92 @@
1
module OmniScrapper
  # Extracts the configured fields from one fetched page.
  #
  # `page` must respond to #uri and #xpath(selector) (the latter returning an
  # object that responds to #text); `config` must respond to #fields, #anchors
  # and #schema.
  class Page
    attr_accessor :page, :config

    def initialize(page, config)
      self.page = page
      self.config = config
    end

    # @return [Hash] field name => extracted value, plus :id_within_site
    def data
      result_data = prepare_data
      validate_data!(result_data)
    end

    private

    # TODO: should be moved to scrapper
    # Builds the result hash: one entry per configured field, then the page id.
    def prepare_data
      result = config.fields.each_with_object({}) do |(field_name, field_options), memo|
        memo[field_name] = get_field(field_options)
      end
      result[:id_within_site] = id_within_site
      result
    end

    # Validates the data against the configured schema (if any) and returns it.
    def validate_data!(result_data)
      # TODO: Fix this is not working
      # schema_pattern method is not available here
      # uncomment when move this to scrapper level
      config.schema.new.validate!(result_data) if config.schema
      result_data
    end

    # Id of the page taken from its URL via the :id_within_site anchor pattern.
    # When that anchor is not configured the whole URL is returned instead of
    # failing on a nil lookup.
    def id_within_site
      anchor = config.anchors[:id_within_site]
      extract(page.uri.to_s, anchor && anchor[:pattern])
    end

    # Resolves a single field. :method and :do take over the whole extraction;
    # otherwise the value goes through find -> normalize -> extract -> type_cast.
    def get_field(options)
      return __send__(options[:method], page) if options[:method]
      return options[:do].call(page) if options[:do]

      value = find(options[:selector])
      value = normalize(value, options[:normalizer])
      value = extract(value, options[:pattern])
      type_cast(value, options[:type_cast_to])
    end

    # Text of the nodes matched by the XPath selector, stripped. Every
    # '/tbody' occurrence is removed from the selector before it is evaluated.
    def find(selector)
      selector = selector.gsub('/tbody', '')
      page.xpath(selector).text.strip
    end

    # First capture (or first match) of pattern in value; value itself when no
    # pattern is given.
    def extract(value, pattern)
      return value unless pattern
      value.scan(pattern).flatten.first
    end

    # Casts the value; only Integer is handled, anything else passes through.
    def type_cast(value, type_class)
      return value unless type_class

      case type_class.to_s
      when 'Integer'
        value.to_i
      else
        value
      end
    end

    # Applies a normalizer given as a class, a snake_case name (Symbol or
    # String) resolved inside OmniScrapper::Normalizers, or a Proc.
    #
    # @raise [ArgumentError] for any other normalizer type (previously such a
    #   normalizer silently turned the value into nil)
    def normalize(value, normalizer)
      case normalizer
      when nil
        value
      when Class
        normalizer.new(value).normalized
      when Symbol, String
        normalizer_name = normalizer.to_s.split('_').map { |w| w.capitalize }.join
        normalizer_class = normalizers_namespace.const_get(normalizer_name)
        normalizer_class.new(value).normalized
      when Proc
        normalizer.call(value)
      else
        raise ArgumentError, "Unsupported normalizer: #{normalizer.inspect}"
      end
    end

    def normalizers_namespace
      root_namespace.const_get('Normalizers')
    end

    def root_namespace
      @root_namespace ||= Kernel.const_get('OmniScrapper')
    end
  end
end
@@ -0,0 +1,6 @@
1
# Locate the installed omni_scrapper gem and load every Rake task file found
# in its lib/tasks directory.
omni_scrapper_spec = Gem::Specification.find_by_name('omni_scrapper')
task_files = Dir["#{omni_scrapper_spec.gem_dir}/lib/tasks/*.rake"]
task_files.each { |task_file| load task_file }
5
+
6
+
@@ -0,0 +1,29 @@
1
# Digest::MD5 is used below; load it explicitly instead of relying on another
# library having required it first.
require 'digest'

module OmniScrapper
  # Outcome of scraping one page: the extracted data together with the
  # scrapper name, the creation time and a checksum of the data.
  class Result
    attr_accessor :scrapper_name, :data, :timestamp, :checksum

    # @param scrapper_name [Object] name of the scrapper that produced the data
    def initialize(scrapper_name)
      self.scrapper_name = scrapper_name
      self.timestamp = Time.now
    end

    # Stores the data and the checksum computed from it.
    #
    # @param data [Object] scraped data
    # @return [String] the computed checksum
    def build(data)
      self.data = data
      self.checksum = Signature.new(data).calculate
    end

    # Computes the checksum of a piece of data.
    # The original file placed a bare `private` before this class; that
    # keyword only affects methods, so the class stays reachable as
    # OmniScrapper::Result::Signature exactly as before.
    class Signature
      attr_accessor :data

      def initialize(data)
        self.data = data
      end

      # @return [String] MD5 hex digest of the data's string representation
      def calculate
        Digest::MD5.hexdigest(data.to_s)
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'json-schema'
2
+
3
module OmniScrapper
  # Base class for data schemas. Subclasses supply the schema definition via
  # #schema; #validate! checks data against it with JSON::Validator.
  class Schema
    # Passes the schema and the data to JSON::Validator.validate!.
    #
    # @param data [Object] data to validate
    def validate!(data)
      JSON::Validator.validate!(schema, data)
    end

    # Schema definition; must be overridden by subclasses.
    #
    # @raise [RuntimeError] always, in this base class
    def schema
      raise 'Implement in child class'
    end
  end
end
@@ -0,0 +1,35 @@
1
# Mixin for scrapper classes. Including it adds the class-level `run`
# shortcut and the instance methods below; the host class is expected to
# provide entrypoint=, agent=, entrypoint_pattern, scrapper_page_class,
# configuration, name and an instance-level run.
module OmniScrapper
  # Hook: gives every including class the ClassMethods.
  def self.included(base)
    base.extend(ClassMethods)
  end

  # Builds a configuration, lets the caller fill it in through the block and
  # defines the scrapper classes from it.
  #
  # @param scrapper_name [Symbol, String] name of the scrapper to define
  # @yieldparam config [OmniScrapper::Configuration]
  def self.setup(scrapper_name)
    configuration = OmniScrapper::Configuration.new
    yield(configuration)
    builder = OmniScrapper::ScrapperBuilder.new(scrapper_name, configuration)
    builder.define_classes
  end

  # @return [Array<Class>] all currently loaded classes that include this module
  def self.scrappers
    ObjectSpace.each_object(Class).select { |candidate| candidate < self }
  end

  module ClassMethods
    # Instantiates the scrapper with default arguments and runs it.
    def run(&block)
      new.run(&block)
    end
  end

  # @param entrypoint_url [String, nil] start URL; falls back to
  #   entrypoint_pattern when omitted
  def initialize(entrypoint_url = nil)
    self.entrypoint = entrypoint_url || entrypoint_pattern
    self.agent = Mechanize.new do |mechanize|
      mechanize.user_agent_alias = 'Mac Safari'
    end
  end

  # Extracts the data of one page and wraps it in a Result.
  #
  # @return [OmniScrapper::Result]
  def scrap_page(page)
    page_data = scrapper_page_class.new(page, configuration).data
    result = OmniScrapper::Result.new(name)
    result.build(page_data)
    result
  end
end
@@ -0,0 +1,75 @@
1
module OmniScrapper
  # Defines, for one scrapper name, the constants
  #   Scrappers::<Name>::Scrapper - subclass of the configured crawler that
  #                                 includes OmniScrapper
  #   Scrappers::<Name>::Page     - subclass of OmniScrapper::Page
  # where <Name> is the capitalized scrapper name.
  class ScrapperBuilder
    attr_accessor :scrapper_name, :config

    # @param scrapper_name [Symbol, String] scrapper name
    # @param config [#crawler, #anchors] scrapper configuration
    def initialize(scrapper_name, config)
      self.scrapper_name = scrapper_name
      self.config = config
    end

    # Defines the scrapper class first, then its page class.
    def define_classes
      define_scrapper_class(scrapper_name, config)
      define_page_class
    end

    private

    # Builds the scrapper class. The values captured by the blocks
    # (config, scrapper_name, current_module) are exposed as instance methods.
    def define_scrapper_class(scrapper_name, config)
      current_module = scrapper_module
      klass = Class.new(config.crawler) do
        include OmniScrapper

        # One "<anchor>_pattern" reader per configured anchor.
        config.anchors.each do |name, options|
          define_method("#{name}_pattern") do
            options[:pattern]
          end
        end

        define_method :configuration do
          config
        end

        define_method :name do
          scrapper_name
        end

        define_method :scrapper_page_class do
          current_module.const_get('Page')
        end
      end

      current_module.const_set(scrapper_class_name, klass)
    end

    # Builds the page class and mixes in the scrapper's ScrapMethods module
    # when one is defined.
    def define_page_class
      page_class = scrapper_module.const_set('Page', Class.new(OmniScrapper::Page))
      page_class.__send__(:include, class_methods_module) if scrapper_module.const_defined?('ScrapMethods')
    end

    # The top-level Scrappers module, created on first use. The original
    # guarded the creation with `defined? scrappers_namespace_module`, which
    # tests whether the *method* exists and is therefore always truthy, so the
    # module was never created and the lookup raised NameError.
    def scrappers_namespace_module
      @scrappers_namespace_module ||=
        if Object.const_defined?(scrappers_namespace_name)
          Object.const_get(scrappers_namespace_name)
        else
          Object.const_set(scrappers_namespace_name, Module.new)
        end
    end

    def scrappers_namespace_name
      :Scrappers
    end

    def scrapper_class_name
      :Scrapper
    end

    # Scrappers::<Name>, created on first use (same `defined?` problem as
    # above). The lookup is limited to the namespace itself so an unrelated
    # top-level constant with the same name is not picked up.
    def scrapper_module
      @scrapper_module ||=
        if scrappers_namespace_module.const_defined?(scrapper_module_name, false)
          scrappers_namespace_module.const_get(scrapper_module_name, false)
        else
          scrappers_namespace_module.const_set(scrapper_module_name, Module.new)
        end
    end

    # Capitalized scrapper name; works for both Symbol and String names.
    def scrapper_module_name
      scrapper_name.to_s.capitalize
    end

    def class_methods_module
      @class_methods_module ||= Object.const_get scrapper_module_array.push('ScrapMethods').join('::')
    end

    def scrapper_module_array
      [scrappers_namespace_name.to_s, scrapper_module_name]
    end
  end
end
@@ -0,0 +1,13 @@
1
+ require 'omni_scrapper/file_utils'
2
+
3
namespace :omni_scrapper do
  # Generates the skeleton files of a new scrapper.
  # Usage: rake omni_scrapper:generate[name]
  desc 'Generate scrapper'
  task :generate, :name do |_task, args|
    scrapper_name = args[:name].to_s.strip

    if scrapper_name.empty?
      # Without a name the generator would create files for a nameless
      # scrapper; stop with a non-zero exit status instead.
      abort 'Scrapper name is required. Usage: rake omni_scrapper:generate[name]'
    elsif OmniScrapper::FileUtils.installed?
      OmniScrapper::FileUtils.generate_scrapper(scrapper_name)
      # puts (not p) so the message is printed without surrounding quotes.
      puts 'Scrapper is generated'
    else
      puts 'OmniScrapper is not installed. Please run `rake omni_scrapper:install` before.'
    end
  end
end
@@ -0,0 +1,17 @@
1
+ require 'omni_scrapper/file_utils'
2
+
3
namespace :omni_scrapper do
  # Creates the OmniScrapper directory structure inside a Hanami project.
  desc 'Install'
  task :install do
    if defined? Hanami
      # puts (not p) so messages are printed without surrounding quotes.
      puts 'Hanami framework detected.'

      if OmniScrapper::FileUtils.installed?
        puts 'OmniScrapper is already installed'
      else
        puts 'Installing'
        OmniScrapper::FileUtils.install
      end
    else
      # Previously the task finished silently here, giving no hint why
      # nothing was installed.
      puts 'No supported framework detected. OmniScrapper can only be installed into a Hanami project.'
    end
  end
end
@@ -0,0 +1,13 @@
1
# Gem specification for omni_scrapper.
Gem::Specification.new do |spec|
  # Identity
  spec.name    = "omni_scrapper"
  spec.version = "1.0.1"

  # Descriptive metadata
  spec.summary     = "This is an example!"
  spec.description = "Much longer explanation of the example!"
  spec.licenses    = ['MIT']
  spec.author      = "Stanislav Mekhonoshin"
  spec.email       = "ejabberd@gmail.com"

  # Packaged files: everything tracked by git, one path per record of the
  # `git ls-files` output (split on the input record separator).
  spec.require_paths = ["lib"]
  spec.files         = `git ls-files`.split($/)

  spec.add_runtime_dependency "mechanize"
end
data/todo.md ADDED
@@ -0,0 +1,3 @@
1
+ * Different enqueue policies (for specific scrappers)
2
+ * Additional optional fields (comments to services)
3
+ * Infer field types from the schema by default
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: omni_scrapper
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Stanislav Mekhonoshin
@@ -29,7 +29,27 @@ email: ejabberd@gmail.com
29
29
  executables: []
30
30
  extensions: []
31
31
  extra_rdoc_files: []
32
- files: []
32
+ files:
33
+ - lib/omni_scrapper.rb
34
+ - lib/omni_scrapper/configuration.rb
35
+ - lib/omni_scrapper/exceptions/crawler_not_defined_exception.rb
36
+ - lib/omni_scrapper/exceptions/invalid_field_arguments_exception.rb
37
+ - lib/omni_scrapper/exceptions/unknown_framework_exception.rb
38
+ - lib/omni_scrapper/exceptions/unsupported_framework_exception.rb
39
+ - lib/omni_scrapper/file_utils.rb
40
+ - lib/omni_scrapper/normalizers.rb
41
+ - lib/omni_scrapper/normalizers/base.rb
42
+ - lib/omni_scrapper/normalizers/phone.rb
43
+ - lib/omni_scrapper/page.rb
44
+ - lib/omni_scrapper/rake.rb
45
+ - lib/omni_scrapper/result.rb
46
+ - lib/omni_scrapper/schema.rb
47
+ - lib/omni_scrapper/scrapper.rb
48
+ - lib/omni_scrapper/scrapper_builder.rb
49
+ - lib/tasks/generate.rake
50
+ - lib/tasks/install.rake
51
+ - omni_scrapper.gemspec
52
+ - todo.md
33
53
  homepage:
34
54
  licenses:
35
55
  - MIT