tango-etl 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0ca6376b79e444981adcf8ee8c66f4cd2a67d6d5
4
+ data.tar.gz: 0e567db2d98e6e1d7fc70c308e26d0959ab82541
5
+ SHA512:
6
+ metadata.gz: 8d51e56712c4aecaca7bd5d1b67e9fad5b5846ed888d83a4cbc6265364c0e7f69dcf8a7f2cb0513bad4324c6ac003125ed7e56a0416f5cf9cd543a1c1a6b96eb
7
+ data.tar.gz: 7ec825c8bc994fd1d0d765d6345e3c39f8fb2f75bc4559663816ebd0cf5aefe0be6df2ec7ee397ba186336bd0810577500bc64d6e3292cbd828735f35c92ef50
data/Gemfile ADDED
@@ -0,0 +1,19 @@
1
+ source 'https://rubygems.org'
2
+
3
+ ruby '2.0.0'
4
+
5
+ gem 'nokogiri', '~> 1.6.1'
6
+ gem 'httparty', '~> 0.13.1'
7
+ gem 'activerecord', '~> 4.1.0'
8
+ gem 'activerecord-import', '~> 0.5.0'
9
+ gem 'ar-multidb', '~> 0.1.12'
10
+
11
+ group :development do
12
+ gem "yard", "~> 0.8.7"
13
+ end
14
+
15
+ group :test do
16
+ gem "shoulda", "~> 3.5.0"
17
+ gem "mocha", "~> 1.0.0"
18
+ gem 'activerecord-nulldb-adapter', '~> 0.3.1'
19
+ end
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ require 'rake/testtask'
2
+
3
+ # Run the unit tests by default
4
+ task :default => "test:unit"
5
+
6
+ namespace :test do
7
+
8
+ Rake::TestTask.new( :unit ) do |t|
9
+ t.test_files = FileList[ 'test/unit/test_*.rb', 'test/unit/*/test_*.rb' ]
10
+ end
11
+
12
+ end
data/changelog.md ADDED
@@ -0,0 +1,4 @@
1
+ # Changelog
2
+
3
+ ### 0.1.0 - April 2014
4
+ - Tango separated from [UEK-Tango](https://bitbucket.org/Mckomo/uek-tango/) as an independent gem
@@ -0,0 +1,6 @@
1
+ production:
2
+ site_index: http://example.com/
3
+ sleep: 0.5
4
+ development:
5
+ site_index: http://localhost:8000/
6
+ sleep: 0
@@ -0,0 +1,27 @@
1
+ development:
2
+ adapter: mysql2
3
+ database: tango_dev_master
4
+ username: root
5
+ password: pass
6
+ host: localhost
7
+ multidb:
8
+ databases:
9
+ slave:
10
+ database: tango_dev_slave
11
+ production:
12
+ adapter: mysql2
13
+ database: tango_prod_master
14
+ username: root
15
+ password: pass
16
+ host: localhost
17
+ multidb:
18
+ databases:
19
+ slave:
20
+ database: tango_prod_slave
21
+ test:
22
+ adapter: mysql2
23
+ database: tango_test
24
+ username: root
25
+ password: pass
26
+ host: localhost
27
+
@@ -0,0 +1,53 @@
1
module Tango

  # Base model for Tango resources.
  #
  # Subclasses are expected to override {#cache_key} and {.persistent?};
  # both raise NotImplementedError here.
  #
  # @author Mckomo
  class AbstractModel < ::ActiveRecord::Base

    # Required by ActiveRecord
    self.abstract_class = true

    @properties = nil
    @last_id = nil

    # Return array with values of model properties
    #
    # @return [Array]
    def values
      attributes.values
    end

    # Return cache key of model instance
    #
    # @raise [NotImplementedError] always; must be overridden by subclasses
    # @return [Object]
    def cache_key
      raise NotImplementedError
    end

    # Return array with names of model properties (attribute names as symbols).
    # The result is memoized per class.
    #
    # @return [Array<Symbol>]
    def self.properties
      @properties ||= attribute_names.map( &:to_sym )
    end

    # Return incremented value of last id in the model's table.
    #
    # The highest id is read from the database only on the first call
    # (0 when the table is empty); afterwards the counter is kept and
    # incremented in memory on the class.
    #
    # @return [Integer]
    def self.next_id
      # MAX(id) is computed by the database; plucking every id and taking the
      # last one loads the whole column and relies on unspecified row order.
      @last_id ||= maximum( :id ) || 0
      @last_id += 1
    end

    # State whether model should be cached or not
    #
    # @raise [NotImplementedError] always; must be overridden by subclasses
    # @return [Boolean]
    def self.persistent?
      raise NotImplementedError
    end

  end

end
53
+
data/lib/tango/app.rb ADDED
@@ -0,0 +1,202 @@
1
+ require 'tango/version'
2
+
3
module Tango

  # Interface for Tango app runtime filters.
  #
  # Subclasses must implement {#before} and {#after}; {#run} drives the whole
  # crawl: it takes links from the link stack, downloads and parses them and
  # hands the document to the handler chosen by the dispatcher.
  #
  # @author Mckomo
  class App

    attr_reader :config, :dispatcher, :link_stack, :logger

    # Every dependency may be injected; a default is built for each one that
    # is omitted.
    #
    # @param config [Hash] App settings; 'target_url' and 'sleep' keys are read
    # @param link_stack [Tango::LinkStack]
    # @param dispatcher [Tango::ETL::Dispatcher]
    # @param cache [Tango::Resource::Cache]
    # @param http_client [Object] Must implement get method
    # @param parser [Object] Must implement parse method
    # @param db_locker [DatabaseLocker]
    # @param logger [Logger]
    # @return [Tango::App]
    def initialize( config: {}, link_stack: nil, dispatcher: nil, cache: nil, http_client: nil, parser: nil, db_locker: nil, logger: nil )

      # Init app properties
      @models = {}
      @operators = {}

      # Set config
      @config = config

      # Set dependencies
      # NOTE(review): the default link stack is built from config['target_url'];
      # confirm the loaded configuration actually provides that key.
      @link_stack = link_stack || LinkStack.new( config['target_url'] )
      @dispatcher = dispatcher || ETL::Dispatcher.new
      @cache = cache || Resource::Cache.new( Resource::Buffer.new )
      @http_client = http_client || HTTParty
      @parser = parser || Nokogiri::HTML
      @db_locker = db_locker || DatabaseLocker.new( Multidb.databases )
      @logger = logger || Logger.new( STDOUT )

    end

    # Filter run before Tango execution
    #
    # @raise [NotImplementedError] always; must be overridden by subclasses
    def before
      raise NotImplementedError
    end

    # Filter run after Tango execution
    #
    # @raise [NotImplementedError] always; must be overridden by subclasses
    def after
      raise NotImplementedError
    end

    # Register new resource model. The table of a non persistent model is
    # truncated right away.
    #
    # @param symbol [Symbol]
    # @param model [Class]
    def register_model( symbol, model )

      @models[symbol] = model

      # Truncate table of non persistent model
      unless model.persistent?
        connection = ActiveRecord::Base.connection
        # Quote the identifier instead of interpolating the raw table name
        connection.execute( "TRUNCATE #{connection.quote_table_name( model.table_name )}" )
      end

    end

    # Register new resource operator
    #
    # @param symbol [Symbol]
    # @param operator [Class] Must implement load class method
    def register_operator( symbol, operator )

      @operators[symbol] = operator

      # Register operator with resource cache system
      @cache.register( symbol ) do |resource|
        operator.load( resource )
      end

    end

    # Run ETL process: select the database, load the cache, run the before
    # filter, crawl every link on the stack, release buffers, run the after
    # filter, lock the used database and close the logger.
    #
    # @return [Integer] Number of links whose handler was triggered successfully
    def run

      # Save beginning time
      start_time = Time.now

      @logger.info "Running Tango v.#{Tango::VERSION} ..."
      @logger.info "Target: #{@link_stack.host}."

      # Use next unlocked database
      Multidb.use( @db_locker.unlocked )
      @logger.info "Using database '#{@db_locker.unlocked}'."

      # Fill cache with persistent models
      @logger.info "Loading cache ..."
      load_cache

      # Run before filter
      @logger.info "Running before callback ..."
      before

      @logger.info "Tango starts crawling ..."
      links_counter = crawl_link_stack

      # Release buffers
      @logger.info "Releasing buffers ..."
      @cache.buffer.release_all

      # Run after filter
      @logger.info "Running after callback ..."
      after

      # Lock database used in this Tango iteration
      @db_locker.lock( @db_locker.unlocked )

      # Get time of script execution ending
      end_time = Time.now

      @logger.info "Tango crawled #{links_counter}/#{@link_stack.shifted} links successfully."
      @logger.info "Start time: #{start_time}, end time: #{end_time}, time elapsed: #{end_time - start_time} seconds."

      # Close logger
      @logger.close

      links_counter

    end

    private

    # Put every instance of each persistent model into the cache
    def load_cache

      @models.each do |symbol, model|
        next unless model.persistent?
        model.all.each { |m| @cache.set( symbol, m ) }
      end

    end

    # Crawl until the link stack is empty; return number of successful links
    def crawl_link_stack

      links_counter = 0
      links_counter += 1 if crawl_link( @link_stack.shift ) while @link_stack.has_links?
      links_counter

    end

    # Download, parse and handle a single link; return true on success
    def crawl_link( link )

      # Skip link if no handler found
      handler_klass = @dispatcher.find_handler( link )
      unless handler_klass
        @logger.error "No handler for link: #{link}."
        return false
      end

      response = fetch_response( link )
      return false unless response

      # Parse response contents and init handler
      document = @parser.parse( response.body )
      handler = handler_klass.new( link, document, @cache )

      # Append links fetched from handler
      @link_stack.append( handler.links )

      succeeded = trigger_handler( handler, link )

      # Sleep to give crawled server time to breathe
      sleep( @config["sleep"] || 0 )

      succeeded

    end

    # Download a link; return the response or nil when it is not usable
    def fetch_response( link )

      begin
        response = @http_client.get( @link_stack.host + link )
      rescue StandardError => e
        @logger.error "Could not download contents of #{@link_stack.host + link} link."
        @logger.error e.message
        return nil
      end

      # Continue only when response has code 200 or 201
      return response if [ 200, 201 ].include?( response.code )

      @logger.error "Response code for link #{link} is #{response.code}. Only codes 200 and 201 are accepted."
      nil

    end

    # Fire the handler and log the outcome; return true when it did not raise
    def trigger_handler( handler, link )
      handler.trigger
    rescue StandardError => e
      @logger.error "Link: #{link}. Handler had some troubles."
      @logger.error e.message
      @logger.error e.backtrace.join( "\n" )
      false
    else
      @logger.debug "Link: #{link}. Handler triggered successfully."
      true
    end

  end

end
@@ -0,0 +1,49 @@
1
require 'fileutils'

module Tango

  # Rotates through a list of candidate databases using a lock file.
  #
  # The lock file holds the name of the database that was locked last;
  # {#unlocked} returns the candidate that follows it (wrapping around),
  # or the first candidate when nothing usable is recorded.
  class DatabaseLocker

    attr_reader :lock_path

    # @param candidates [Array] Names of databases to rotate through
    # @param lock_path [String] Path of the file storing the locked database name
    def initialize( candidates = [], lock_path = "./tmp/database.lock" )
      @candidates = candidates
      @lock_path = lock_path
    end

    # Return next unlocked database. The result is memoized until {#lock}
    # is called.
    #
    # @return [Object, nil] A candidate, or nil when there are no candidates
    def unlocked
      @unlocked ||= find_unlocked
    end

    # Record given database as locked and forget the memoized unlocked one.
    #
    # @param database [#to_s] Name of the database to lock
    # @return [DatabaseLocker] self
    def lock( database )

      @unlocked = nil

      # The lock directory may not exist yet (e.g. on the first run)
      FileUtils.mkdir_p( File.dirname( lock_path ) )
      File.open( lock_path, "w" ) { |f| f.write( database.to_s ) }

      self

    end

    private

    # Pick the candidate following the locked one, or the first candidate
    def find_unlocked

      locked = read_lock

      # Compare by name so symbol and string candidates both match the file
      index = locked.empty? ? nil : @candidates.index { |c| c.to_s == locked }

      # Nothing (known) locked yet: start with the first candidate
      return @candidates.first unless index

      @candidates.at( index.next % @candidates.length )

    end

    # Read normalized contents of the lock file; empty string if it is missing
    def read_lock
      return "" unless File.exist?( lock_path )
      File.read( lock_path ).strip.gsub( /\s+/, ' ' )
    end

  end

end
@@ -0,0 +1,49 @@
1
module Tango
  module ETL

    # Keeps an ordered list of handler classes and picks the one that should
    # process a given URL.
    #
    # @author Mckomo
    class Dispatcher

      def initialize
        @handlers = []
      end

      # Add a handler class to the end of the list.
      #
      # @param handler_class [Class] Class that has HandlerInterface among its ancestors
      # @raise [RuntimeError] When handler_class does not descend from HandlerInterface
      # @return [Dispatcher] self, so calls can be chained
      def register( handler_class )
        implements_interface = handler_class.ancestors.include?( Tango::ETL::HandlerInterface )
        raise "Handler must implement HandlerInterface" unless implements_interface

        @handlers.push( handler_class )
        self
      end

      # Return the first registered handler class that reports itself as
      # applicable to the URL.
      #
      # @param url [String] URL of the page to be handled
      # @return [Class, nil] Matching handler class, or nil when none applies
      def find_handler( url )
        @handlers.find { |candidate| candidate.applicable?( url ) }
      end

    end

  end
end
@@ -0,0 +1,40 @@
1
module Tango

  module ETL

    # Base class for page handlers. Concrete handlers override {#links},
    # {#trigger} and {.applicable?}; the versions defined here only raise
    # NotImplementedError.
    #
    # @author Mckomo
    class HandlerInterface

      # Store the link, its parsed document and an optional cache on the
      # handler instance.
      #
      # @param link [String]
      # @param document [Object]
      # @param cache [Object, nil]
      def initialize( link, document, cache = nil )
        @link, @document, @cache = link, document, cache
      end

      # Links found by the handler.
      #
      # @raise [NotImplementedError] always; must be overridden
      # @return [Array|String]
      def links
        raise NotImplementedError
      end

      # Run the handler.
      #
      # @raise [NotImplementedError] always; must be overridden
      # @return [NilClass]
      def trigger
        raise NotImplementedError
      end

      class << self

        # Tell whether this handler class can handle the given link.
        #
        # @param link [String]
        # @raise [NotImplementedError] always; must be overridden
        def applicable?( link )
          raise NotImplementedError
        end

      end

    end

  end
end
@@ -0,0 +1,36 @@
1
module Tango

  module ETL

    # Base class for operators following the extract / transform / load
    # pattern. All three steps are class methods that concrete operators
    # override; here each of them only raises NotImplementedError.
    #
    # @author Mckomo
    class OperatorInterface

      class << self

        # Extract resource params
        #
        # @param element [Object] Element from which resources should be extracted
        # @raise [NotImplementedError] always; must be overridden
        # @return [Object] Extracted resource or array with resources
        def extract( element )
          raise NotImplementedError
        end

        # Transform resource params to desired state
        #
        # @param resource [Object] Resource or array with resources
        # @raise [NotImplementedError] always; must be overridden
        # @return [Object] Transformed resource or array with resources
        def transform( resource )
          raise NotImplementedError
        end

        # Load resources into a storage
        #
        # @param resources [Array] Batch of resources to load
        # @raise [NotImplementedError] always; must be overridden
        def load( resources )
          raise NotImplementedError
        end

      end

    end

  end
end
data/lib/tango/etl.rb ADDED
@@ -0,0 +1,3 @@
1
+ require 'tango/etl/dispatcher'
2
+ require 'tango/etl/handler_interface'
3
+ require 'tango/etl/operator_interface'
@@ -0,0 +1,36 @@
1
module Tango
  module Kernel

    # Convert file path to class name: the extension and directories are
    # dropped and the underscore separated words are capitalized and joined.
    #
    # @example
    #   Tango::Kernel.classify( "lib/super_klass.rb" ) #=> "SuperKlass"
    #
    # @param file_path [String]
    # @return [String]
    def self.classify( file_path )
      File.basename( file_path, ".*" ).split( "_" ).map( &:capitalize ).join
    end

    # Load class from a file: the file is required and the constant named
    # after it (see {.classify}) is looked up.
    #
    # @param file [String]
    # @param module_prefix [String] Namespace prepended to the class name, e.g. "Tango::ETL::"
    # @raise [NameError] When the expected constant is not defined
    # @return [Class]
    def self.load( file, module_prefix = "" )
      require file

      const_get( "#{module_prefix}#{classify( file )}" )
    end

    # Obtain symbol of a class: the namespace is dropped and the class name
    # is converted from CamelCase to snake_case.
    #
    # @example
    #   Tango::Kernel.symbolize( A::B::SuperKlass ) #=> :super_klass
    #   Tango::Kernel.symbolize( HTTPServer )       #=> :http_server
    #
    # @param klass [Class]
    # @return [Symbol]
    def self.symbolize( klass )
      name = klass.to_s.split( '::' ).last

      # Split an acronym from the word that follows it first ("HTTPServer" ->
      # "HTTP_Server"), then split lower-case/digit to upper-case boundaries.
      # Splitting before every capital would tear acronyms apart.
      name.gsub( /([A-Z]+)([A-Z][a-z])/, '\1_\2' )
          .gsub( /([a-z\d])([A-Z])/, '\1_\2' )
          .downcase
          .to_sym
    end

  end
end
@@ -0,0 +1,61 @@
1
+ # Load system lib
2
+ require 'uri'
3
+
4
module Tango

  # Links waiting to be crawled.
  #
  # Links are stored without the host part. They are appended to the end and
  # shifted from the front, so they are crawled in the order they were added.
  #
  # @author Mckomo
  class LinkStack

    attr_reader :host, :links, :shifted

    # @param base_link [String] Absolute URL of the page crawling starts from
    # @raise [ArgumentError] When base_link is not an absolute URL with a host
    def initialize( base_link )

      url = parse_base_link( base_link )

      @host = "#{url.scheme}://#{url.host}:#{url.port}" # Extract host from base link
      @links = [] # Container for links (without host part)
      @shifted = 0 # Shifted links counter

      # Extract path (with query) from base link and append it as initial link
      path = url.query ? "#{url.path}?#{url.query}" : url.path
      append( path )

    end

    # Remove the first link from the stack and count it as shifted
    #
    # @return [String, nil] The link, or nil when the stack is empty
    def shift
      return unless has_links?
      @shifted += 1
      @links.shift
    end

    # Append link/s to stack. Values that are neither a String nor an Array
    # are ignored.
    #
    # @param links [Array|String]
    # @return [Array, nil] Links on the stack, or nil when nothing was appended
    def append( links )
      if links.is_a? String
        @links << links
      elsif links.is_a? Array
        @links += links
      end
    end

    # Check if link stack still has links
    #
    # @return [Boolean]
    def has_links?
      ! @links.empty?
    end

    private

    # Parse base link into a URI, insisting on both a scheme and a host
    def parse_base_link( base_link )
      url = URI( base_link.to_s )
      return url if url.scheme && url.host && ! url.host.empty?

      raise ArgumentError, "'#{base_link}' is not valid website URL."
    rescue URI::InvalidURIError
      raise ArgumentError, "'#{base_link}' is not valid website URL."
    end

  end
end
@@ -0,0 +1,11 @@
1
# Extension of the Multidb module: exposes the balancer's candidates and
# adds a shortcut returning their names.
module Multidb

  # Reopened to make the candidates of a balancer readable from outside
  class Balancer
    attr_reader :candidates
  end

  class << self

    # Names (keys) of the candidates held by the current balancer
    #
    # @return [Array]
    def databases
      balancer.candidates.keys
    end

  end

end
@@ -0,0 +1,74 @@
1
module Tango
  module Resource

    # Resource buffer.
    #
    # Collects resources per registered type and hands them to the type's
    # release callback in batches, either when a buffer reaches its maximum
    # size or when {#release_all} is called.
    #
    # @author Mckomo
    class Buffer

      # Constructor of the Buffer
      #
      # @param size [Integer] Number of resources of one type that triggers a release
      # @return [Tango::Resource::Buffer]
      def initialize( size = 500 )

        # Set max size of the buffer
        @size = size

        # Init container for resources buffer
        @resources = {}
        # Init container for release callbacks
        @callbacks = {}

      end

      # Register new type of resource to be buffered. Registering a type
      # again starts it with an empty buffer.
      #
      # @param type [Symbol]
      # @param release_callback [Proc] Called with an array of released resources
      # @raise [ArgumentError] When no block is given
      def register( type, &release_callback )

        raise ArgumentError, "No release callback given" unless block_given?

        @resources[type] = []
        @callbacks[type] = release_callback

      end

      # Fill buffer with a resource
      #
      # @param type [Symbol]
      # @param resource [Object]
      # @raise [ArgumentError] When the type was not registered
      def fill( type, resource )

        raise ArgumentError, "Trying to fill object with unregistered type" unless @resources.key?( type )

        # Append resource to the buffer
        @resources[type] << resource
        # Release the buffer once it reaches its maximum size
        release( type ) if @resources[type].count >= @size

      end

      # Release all registered buffers
      def release_all
        @resources.keys.each { |type| release( type ) }
      end

      private

      # Release buffer with given type
      #
      # @param type [Symbol]
      def release( type )
        # Hand the callback its own copy: the internal array is cleared right
        # after, which would otherwise empty a batch the callback kept.
        @callbacks[type].call( @resources[type].dup )
        @resources[type].clear # Clear resources from the buffer
      end

    end

  end
end