tango-etl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0ca6376b79e444981adcf8ee8c66f4cd2a67d6d5
4
+ data.tar.gz: 0e567db2d98e6e1d7fc70c308e26d0959ab82541
5
+ SHA512:
6
+ metadata.gz: 8d51e56712c4aecaca7bd5d1b67e9fad5b5846ed888d83a4cbc6265364c0e7f69dcf8a7f2cb0513bad4324c6ac003125ed7e56a0416f5cf9cd543a1c1a6b96eb
7
+ data.tar.gz: 7ec825c8bc994fd1d0d765d6345e3c39f8fb2f75bc4559663816ebd0cf5aefe0be6df2ec7ee397ba186336bd0810577500bc64d6e3292cbd828735f35c92ef50
data/Gemfile ADDED
@@ -0,0 +1,19 @@
1
+ source 'https://rubygems.org'
2
+
3
+ ruby '2.0.0'
4
+
5
+ gem 'nokogiri', '~> 1.6.1'
6
+ gem 'httparty', '~> 0.13.1'
7
+ gem 'activerecord', '~> 4.1.0'
8
+ gem 'activerecord-import', '~> 0.5.0'
9
+ gem 'ar-multidb', '~> 0.1.12'
10
+
11
+ group :development do
12
+ gem "yard", "~> 0.8.7"
13
+ end
14
+
15
+ group :test do
16
+ gem "shoulda", "~> 3.5.0"
17
+ gem "mocha", "~> 1.0.0"
18
+ gem 'activerecord-nulldb-adapter', '~> 0.3.1'
19
+ end
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ require 'rake/testtask'
2
+
3
+ # Run the unit tests by default
4
+ task :default => "test:unit"
5
+
6
+ namespace :test do
7
+
8
+ Rake::TestTask.new( :unit ) do |t|
9
+ t.test_files = FileList[ 'test/unit/test_*.rb', 'test/unit/*/test_*.rb' ]
10
+ end
11
+
12
+ end
data/changelog.md ADDED
@@ -0,0 +1,4 @@
1
+ # Changelog
2
+
3
+ ### 0.1.0 - April 2014
4
+ - Tango separated from [UEK-Tango](https://bitbucket.org/Mckomo/uek-tango/) as an independent gem
@@ -0,0 +1,6 @@
1
+ production:
2
+ site_index: http://example.com/
3
+ sleep: 0.5
4
+ development:
5
+ site_index: http://localhost:8000/
6
+ sleep: 0
@@ -0,0 +1,27 @@
1
+ development:
2
+ adapter: mysql2
3
+ database: tango_dev_master
4
+ username: root
5
+ password: pass
6
+ host: localhost
7
+ multidb:
8
+ databases:
9
+ slave:
10
+ database: tango_dev_slave
11
+ production:
12
+ adapter: mysql2
13
+ database: tango_prod_master
14
+ username: root
15
+ password: pass
16
+ host: localhost
17
+ multidb:
18
+ databases:
19
+ slave:
20
+ database: tango_prod_slave
21
+ test:
22
+ adapter: mysql2
23
+ database: tango_test
24
+ username: root
25
+ password: pass
26
+ host: localhost
27
+
@@ -0,0 +1,53 @@
1
module Tango

  # Base model for Tango resources.
  #
  # Concrete models inherit from this class and are expected to override
  # {#cache_key} and {.persistent?}, which raise NotImplementedError here.
  #
  # @author Mckomo
  class AbstractModel < ::ActiveRecord::Base

    # Required by ActiveRecord: this class is not backed by a table of its own
    self.abstract_class = true

    # Class-level memo slots used by .properties and .next_id.
    # They are class instance variables, so every subclass keeps its own copy.
    @properties = nil
    @last_id = nil

    # Return array with values of model properties
    #
    # @return [Array]
    def values
      attributes.values
    end

    # Return cache key of model instance
    #
    # @raise [NotImplementedError] always; subclasses must override
    # @return [Object]
    def cache_key
      raise NotImplementedError
    end

    # Return array with names of model properties (memoized per class)
    #
    # @return [Array<Symbol>]
    def self.properties
      @properties ||= attribute_names.map(&:to_sym)
    end

    # Return incremented value of last id in the model's table.
    #
    # The table is queried only on the first call; afterwards the counter is
    # advanced in memory. The highest id is obtained with an aggregate query
    # instead of loading every id and trusting the row order of the result.
    #
    # @return [Integer]
    def self.next_id
      @last_id ||= maximum(:id) || 0
      @last_id += 1
    end

    # State whether model should be cached or not
    #
    # @raise [NotImplementedError] always; subclasses must override
    # @return [Boolean]
    def self.persistent?
      raise NotImplementedError
    end

  end

end
53
+
data/lib/tango/app.rb ADDED
@@ -0,0 +1,202 @@
1
+ require 'tango/version'
2
+
3
module Tango

  # Tango application runtime.
  #
  # Holds the crawler's collaborators (link stack, dispatcher, cache, HTTP
  # client, parser, database locker, logger) and drives the crawl in {#run}.
  # Subclasses must provide the {#before} and {#after} filters.
  #
  # @author Mckomo
  class App

    attr_reader :config, :dispatcher, :link_stack, :logger

    # Every dependency can be injected; a default is built for each one left nil.
    #
    # @param config [Hash] Settings; 'target_url' is read when no link stack is given, 'sleep' is read on every crawled link
    # @param link_stack [Tango::LinkStack]
    # @param dispatcher [Tango::ETL::Dispatcher]
    # @param cache [Tango::Resource::Cache]
    # @param http_client [Object] Must implement get method
    # @param parser [Object] Must implement parse method
    # @param db_locker [DatabaseLocker]
    # @param logger [Logger]
    # @return [Tango::App]
    def initialize( config: {}, link_stack: nil, dispatcher: nil, cache: nil, http_client: nil, parser: nil, db_locker: nil, logger: nil )

      # Init app properties
      @models = {}
      @operators = {}

      # Set config
      @config = config

      # Set dependencies
      @link_stack = link_stack || LinkStack.new( config['target_url'] )
      @dispatcher = dispatcher || ETL::Dispatcher.new
      @cache = cache || Resource::Cache.new( Resource::Buffer.new )
      @http_client = http_client || HTTParty
      @parser = parser || Nokogiri::HTML
      @db_locker = db_locker || DatabaseLocker.new( Multidb.databases )
      @logger = logger || Logger.new( STDOUT )

    end

    # Filter run before Tango execution
    #
    # @raise [NotImplementedError] always; subclasses must override
    def before
      raise NotImplementedError
    end

    # Filter run after Tango execution
    #
    # @raise [NotImplementedError] always; subclasses must override
    def after
      raise NotImplementedError
    end

    # Register new resource model
    #
    # @param symbol [Symbol]
    # @param model [Class] Must respond to persistent? and table_name
    def register_model( symbol, model )

      @models[symbol] = model

      # Truncate table of non persistent model
      unless model.persistent?
        ActiveRecord::Base.connection.execute( "TRUNCATE #{model.table_name}" )
      end

    end

    # Register new resource operator
    #
    # @param symbol [Symbol]
    # @param operator [Class] Must respond to load
    def register_operator( symbol, operator )

      @operators[symbol] = operator

      # Register operator with resource cache system
      @cache.register( symbol ) do |resource|
        operator.load( resource )
      end

    end

    # Run ETL process: select the unlocked database, load the cache, run the
    # before filter, crawl every link on the stack, release the buffers, run
    # the after filter, lock the database that was used and close the logger.
    #
    # @return [Integer] Number of links whose handler was triggered without raising
    def run

      # Save beginning time
      start_time = Time.now

      @logger.info "Running Tango v.#{Tango::VERSION} ..."
      @logger.info "Target: #{@link_stack.host}."

      # Use next unlocked database
      Multidb.use( @db_locker.unlocked )
      @logger.info "Using database '#{@db_locker.unlocked}'."

      # Fill cache with instances of persistent models
      @logger.info "Loading cache ..."
      load_cache

      # Run before filter
      @logger.info "Running before callback ..."
      before

      # Init counter of crawled links
      links_counter = 0
      @logger.info "Tango starts crawling ..."

      # Start crawling website
      while @link_stack.has_links?

        # Get a link from the stack
        link = @link_stack.shift

        # Skip iteration if no handler found
        handler_klass = @dispatcher.find_handler( link )
        unless handler_klass
          @logger.error "No handler for link: #{link}."
          next
        end

        # Skip iteration if contents of the link could not be obtained
        response = fetch_response( link )
        next unless response

        # Parse response contents and init handler
        document = @parser.parse( response.body )
        handler = handler_klass.new( link, document, @cache )

        # Append links fetched from handler
        @link_stack.append( handler.links )

        # Try to fire the handler
        links_counter += 1 if trigger_handler( handler, link )

        # Sleep to give crawled server time to breathe
        sleep( @config["sleep"] || 0 )

      end

      # Release buffers
      @logger.info "Releasing buffers ..."
      @cache.buffer.release_all

      # Run after filter
      @logger.info "Running after callback ..."
      after

      # Lock database used in this Tango iteration
      @db_locker.lock( @db_locker.unlocked )

      # Get time of script execution ending
      end_time = Time.now

      @logger.info "Tango crawled #{links_counter}/#{@link_stack.shifted} links successfully."
      @logger.info "Start time: #{start_time}, end time: #{end_time}, time elapsed: #{end_time - start_time} seconds."

      # Close logger
      @logger.close

      links_counter

    end

    private

    # Download contents of a link.
    #
    # @param link [String] Link relative to the host of the link stack
    # @return [Object, nil] Response with code 200 or 201; nil (after logging) when the download failed or any other code came back
    def fetch_response( link )

      begin
        response = @http_client.get( @link_stack.host + link )
      rescue StandardError => e
        @logger.error "Could not download contents of #{@link_stack.host + link} link."
        @logger.error e.message
        return nil
      end

      return response if [ 200, 201 ].include?( response.code )

      @logger.error "Response code for link #{link} is #{response.code}. Only codes 200 and 201 are accepted."
      nil

    end

    # Trigger a handler, logging the outcome.
    #
    # @param handler [Object] Must implement trigger method
    # @param link [String] Link being handled, used in log messages only
    # @return [Boolean] true when the handler finished without raising a StandardError
    def trigger_handler( handler, link )
      handler.trigger
    rescue StandardError => e
      @logger.error "Link: #{link}. Handler had some troubles."
      @logger.error e.message
      @logger.error e.backtrace.join( "\n" )
      false
    else
      @logger.debug "Link: #{link}. Handler triggered successfully."
      true
    end

    # Put every stored instance of each persistent model into the cache
    def load_cache

      @models.each do |symbol, model|
        next unless model.persistent?
        model.all.each { |m| @cache.set( symbol, m ) }
      end

    end

  end

end
@@ -0,0 +1,49 @@
1
module Tango

  # Rotates between candidate databases with the help of a lock file.
  #
  # The lock file stores the name of the database that was locked last;
  # the "unlocked" database is the candidate that follows it (wrapping around).
  class DatabaseLocker

    attr_reader :lock_path

    # @param candidates [Array<String, Symbol>] Names of databases to rotate between
    # @param lock_path [String] Path of the file that stores name of the locked database
    def initialize( candidates = [], lock_path = "./tmp/database.lock" )
      @candidates = candidates
      @lock_path = lock_path
    end

    # Return next unlocked database (memoized until the next call of #lock)
    #
    # @return [String, Symbol, nil] nil when there are no candidates
    def unlocked
      @unlocked ||= find_unlocked
    end

    # Write name of the database to the lock file and forget memoized result
    #
    # @param database [String, Symbol]
    # @return [DatabaseLocker] self
    def lock( database )

      @unlocked = nil

      File.open( lock_path, "w" ) { |f| f.write( database ) }

      self

    end

    private

    # Read the lock file (it is created when missing) and pick a candidate
    def find_unlocked

      locked = File.open( lock_path, 'a+' ) { |f| f.read.strip.gsub( /\s+/, ' ' ) }

      # The lock file holds plain text, so compare candidates by their string
      # form; otherwise Symbol candidates would never match the stored name
      index = locked.empty? ? nil : @candidates.index { |candidate| candidate.to_s == locked }

      # Nothing (known) was locked: start with the first candidate
      return @candidates.first if index.nil?

      # Some database was locked: use the next one, wrapping around
      @candidates.at( index.next % @candidates.length )

    end

  end

end
@@ -0,0 +1,49 @@
1
module Tango
  module ETL

    # Keeps an ordered list of handler classes and picks the first one
    # that declares itself applicable to a given URL.
    #
    # @author Mckomo
    class Dispatcher

      def initialize
        @handlers = []
      end

      # Add a handler class to the end of the list
      #
      # @param handler_class [Class] Class that has HandlerInterface among its ancestors
      # @raise [RuntimeError] when handler_class does not descend from HandlerInterface
      # @return [Dispatcher] self, so calls can be chained
      def register(handler_class)
        implements_interface = handler_class.ancestors.include?(Tango::ETL::HandlerInterface)
        raise "Handler must implement HandlerInterface" unless implements_interface

        @handlers.push(handler_class)
        self
      end

      # Look up the first registered handler applicable to the URL
      #
      # @param url [String] URL of the page to be handled
      # @return [Class, nil] handler class, or nil when none is applicable
      def find_handler(url)
        @handlers.find { |candidate| candidate.applicable?(url) }
      end

    end

  end
end
@@ -0,0 +1,40 @@
1
module Tango

  module ETL

    # Base class for page handlers.
    #
    # Stores the link, the parsed document and an optional cache; every
    # behavioural method raises NotImplementedError until a subclass
    # overrides it.
    #
    # @author Mckomo
    class HandlerInterface

      class << self

        # Tell whether this handler can process the given link
        #
        # @param link [String]
        # @raise [NotImplementedError] always; subclasses must override
        def applicable?(link)
          raise NotImplementedError
        end

      end

      # @param link [String] Link the handler was created for
      # @param document [Object] Parsed contents of the link
      # @param cache [Object, nil] Optional resource cache
      def initialize(link, document, cache = nil)
        @link, @document, @cache = link, document, cache
      end

      # Links the handler wants to be crawled next
      #
      # @raise [NotImplementedError] always; subclasses must override
      # @return [Array|String]
      def links
        raise NotImplementedError
      end

      # Process the document
      #
      # @raise [NotImplementedError] always; subclasses must override
      # @return [NilClass]
      def trigger
        raise NotImplementedError
      end

    end

  end
end
@@ -0,0 +1,36 @@
1
module Tango

  module ETL

    # Contract of an operator following the ETL pattern. All three steps are
    # class-level methods that raise NotImplementedError until overridden.
    #
    # @author Mckomo
    class OperatorInterface

      class << self

        # Extract resource params
        #
        # @param element [Object] Element from which resources should be extracted
        # @raise [NotImplementedError] always; subclasses must override
        # @return [Object] Extracted resource or array with resources
        def extract(element)
          raise NotImplementedError
        end

        # Transform resource params to desired state
        #
        # @param resource [Object] Resource or array with resources
        # @raise [NotImplementedError] always; subclasses must override
        # @return [Object] Transformed resource or array with resources
        def transform(resource)
          raise NotImplementedError
        end

        # Load resources into a storage
        #
        # @param resources [Array] Batch of resources to load
        # @raise [NotImplementedError] always; subclasses must override
        def load(resources)
          raise NotImplementedError
        end

      end

    end

  end
end
data/lib/tango/etl.rb ADDED
@@ -0,0 +1,3 @@
1
+ require 'tango/etl/dispatcher'
2
+ require 'tango/etl/handler_interface'
3
+ require 'tango/etl/operator_interface'
@@ -0,0 +1,36 @@
1
module Tango
  module Kernel

    # Convert file path to class name: the base name without extension is
    # split on underscores and every word is capitalized.
    #
    # @example
    #   Tango::Kernel.classify( "models/super_klass.rb" ) #=> "SuperKlass"
    #
    # @param file_path [String]
    # @return [String]
    def self.classify(file_path)
      words = File.basename(file_path, ".*").split("_")
      words.map(&:capitalize).join
    end

    # Require a file and return the constant named after it
    #
    # @param file [String] Path handed to require
    # @param module_prefix [String] Text put in front of the class name, e.g. "Some::Module::"
    # @return [Class]
    def self.load(file, module_prefix = "")
      require file

      const_get("#{module_prefix}#{classify(file)}")
    end

    # Obtain symbol of a class
    # @example
    #   Tango::Kernel.symbolize( A::B::SuperKlass ) #=> :super_klass
    #
    # @param klass [Class]
    # @return [Symbol]
    def self.symbolize(klass)
      base_name = klass.to_s.split('::').last
      underscored = base_name.gsub(/(.)([A-Z])/, '\1_\2')
      underscored.downcase.to_sym
    end

  end
end
@@ -0,0 +1,61 @@
1
+ # Load system lib
2
+ require 'uri'
3
+
4
module Tango

  # Stack of links to be crawled
  #
  # Links are stored without the host part; the host is kept separately.
  #
  # @author Mckomo
  class LinkStack

    attr_reader :host, :links, :shifted

    # @param base_link [String] Absolute HTTP(S) URL the crawl starts from
    # @raise [ArgumentError] when base_link is not an HTTP(S) URL with a host
    def initialize( base_link )

      # Parse and validate base link
      url = parse_base_link( base_link )

      @host = "#{url.scheme}://#{url.host}:#{url.port}" # Extract host from base link
      @links = [] # Container for links (without host part)
      @shifted = 0 # Shifted links counter

      # Extract path (with query) from base link and append it as initial link
      path = url.query ? "#{url.path}?#{url.query}" : url.path
      append( path )

    end

    # Shift link from stack and count it as shifted
    #
    # @return [String, nil] nil when the stack is empty
    def shift
      return unless has_links?
      @shifted += 1
      @links.shift
    end

    # Append link/s to stack; anything that is neither String nor Array is ignored
    #
    # @param links [Array|String]
    # @return [Array, nil] Current links, or nil when the argument was ignored
    def append( links )
      case links
      when String then @links << links
      when Array then @links += links
      end
    end

    # Check if link stack still has links
    #
    # @return [Boolean]
    def has_links?
      !@links.empty?
    end

    private

    # Parse the base link, accepting only HTTP(S) URLs that carry a host.
    #
    # Parsing the whole string replaces a match against the obsolete and
    # unanchored URI.regexp, which let through non-web URIs and strings that
    # merely contain a URL.
    #
    # @param base_link [String]
    # @raise [ArgumentError]
    # @return [URI::HTTP]
    def parse_base_link( base_link )

      url = begin
        URI.parse( base_link.to_s )
      rescue URI::InvalidURIError
        nil
      end

      # URI::HTTPS inherits from URI::HTTP, so this covers both schemes
      unless url.is_a?( URI::HTTP ) && !url.host.to_s.empty?
        raise ArgumentError, "'#{base_link}' is not valid website URL."
      end

      url

    end

  end
end
@@ -0,0 +1,11 @@
1
# Extension of the Multidb module: exposes the balancer's candidates
# so that names of the configured databases can be listed.
module Multidb

  class Balancer
    # Reader for the balancer's @candidates instance variable
    attr_reader :candidates
  end

  class << self

    # Names of the databases known to the balancer
    #
    # @return [Array] keys of the balancer's candidates
    def databases
      balancer.candidates.keys
    end

  end

end
@@ -0,0 +1,74 @@
1
module Tango
  module Resource

    # Collects resources per registered type and hands each batch to the
    # type's release callback once the batch reaches the configured size.
    #
    # @author Mckomo
    class Buffer

      # @param size [Integer] Number of resources of one type that triggers a release
      # @return [Tango::Resource::Buffer]
      def initialize(size = 500)
        @size = size      # Max size of a single buffer
        @resources = {}   # type => buffered resources
        @callbacks = {}   # type => release callback
      end

      # Register new type of resource to be buffered.
      # Registering a type again replaces its callback and empties its buffer.
      #
      # @param type [Symbol]
      # @param release_callback [Proc] Called with the array of buffered resources
      # @raise [ArgumentError] when no block is given
      def register(type, &release_callback)
        raise ArgumentError, "No release callback given" if release_callback.nil?

        @resources[type] = []
        @callbacks[type] = release_callback
      end

      # Fill buffer with a resource and release it when the size is reached
      #
      # @param type [Symbol]
      # @param resource [Object]
      # @raise [ArgumentError] when the type was not registered
      def fill(type, resource)
        unless @resources.key?(type)
          raise ArgumentError, "Trying to fill object with unregistered type"
        end

        pending = @resources[type]
        pending << resource

        release(type) if pending.count >= @size
      end

      # Release all registered buffers, including empty ones
      def release_all
        @resources.keys.each do |type|
          release(type)
        end
      end

      private

      # Release buffer with given type
      #
      # The callback receives the live buffer array, which is emptied right
      # after the callback returns.
      #
      # @param type [Symbol]
      def release(type)
        on_release = @callbacks[type]
        on_release.call(@resources[type])

        @resources[type].clear
      end

    end

  end
end