tango-etl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,81 @@
1
+ module Tango
2
+ module Resource
3
+
4
+ # Key-value caching system for resources; each registered type has its own store keyed by the resource's cache_key
5
+ #
6
+ # @author Mckomo
7
+ class Cache
8
+
9
+ attr_reader :buffer
10
+
11
+ # Constructor of the cache
12
+ #
13
+ # @param buffer [Tango::Resource::Buffer] buffer to notify; a new Buffer is created when nil
14
+ # @return [Tango::Resource::Cache]
15
+ def initialize( buffer = nil )
16
+ # Set dependencies
17
+ @buffer = buffer || Buffer.new
18
+ # Container for cached resources: { type => { cache_key => resource } }
19
+ @storage = {}
20
+ end
21
+
22
+ # Register new type of resource to be cached
23
+ #
24
+ # @param type [Symbol]
25
+ # @param release_callback [Proc] block forwarded unchanged to the buffer's register
26
+ def register( type, &release_callback )
27
+ # Create container for cache of new resource (registering a type again replaces it with an empty one)
28
+ @storage[type] = {}
29
+ # Also register new type with buffer
30
+ @buffer.register( type, &release_callback )
31
+ end
32
+
33
+ # Get a cached resource or, on a cache miss, use the given block to build it, cache it and fill the buffer with it
34
+ #
35
+ # @param type [Symbol]
36
+ # @param resource [Object] object responding to cache_key; passed to the block on a miss
37
+ # @return [Object] the cached resource; raises ArgumentError for an unregistered type or a miss without a block
38
+ def load( type, resource )
39
+
40
+ # Get resource from cache
41
+ cached_resource = get( type, resource )
42
+
43
+ unless cached_resource
44
+
45
+ raise ArgumentError, "No resource callback given" unless block_given?
46
+ # If not found, yield the resource to the block to receive the transformed resource
47
+ cached_resource = yield( resource )
48
+ # Cache new resource (stored under the yielded resource's own cache_key)
49
+ set( type, cached_resource )
50
+ # Fill buffer with newly cached resource
51
+ @buffer.fill( type, cached_resource )
52
+
53
+ end
54
+
55
+ cached_resource
56
+
57
+ end
58
+
59
+ # Setter for the cache storage; stores the resource under its cache_key
60
+ #
61
+ # @param type [Symbol]
62
+ # @param resource [Object] object responding to cache_key
63
+ # @return [Object] the stored resource; raises ArgumentError when the type is not registered
64
+ def set( type, resource )
65
+ raise ArgumentError, "Trying to set resource with unregistered type" unless @storage.keys.include?( type )
66
+ @storage[type][resource.cache_key] = resource
67
+ end
68
+
69
+ # Getter for the cache storage; looks the resource up by its cache_key
70
+ #
71
+ # @param type [Symbol]
72
+ # @param resource [Object] object responding to cache_key
73
+ # @return [Object, nil] the cached resource, or nil when nothing is stored under that key; raises ArgumentError when the type is not registered
74
+ def get( type, resource )
75
+ raise ArgumentError, "Trying to get resource with unregistered type" unless @storage.keys.include?( type )
76
+ @storage[type][resource.cache_key]
77
+ end
78
+
79
+ end
80
+ end
81
+ end
@@ -0,0 +1,2 @@
1
+ require 'tango/resource/buffer'
2
+ require 'tango/resource/cache'
@@ -0,0 +1,3 @@
1
+ module Tango
2
+ VERSION = "0.0.1"
3
+ end
data/lib/tango.rb ADDED
@@ -0,0 +1,18 @@
1
+ # Tango == ETL => true
2
+ require 'nokogiri'
3
+ require 'httparty'
4
+ require 'active_record'
5
+ require 'activerecord-import'
6
+ require 'ar-multidb'
7
+
8
+ require 'yaml'
9
+ require 'logger'
10
+
11
+ require 'tango/app'
12
+ require 'tango/kernel'
13
+ require 'tango/multidb'
14
+ require 'tango/abstract_model'
15
+ require 'tango/link_stack'
16
+ require 'tango/database_locker'
17
+ require 'tango/etl'
18
+ require 'tango/resource'
data/readme.md ADDED
@@ -0,0 +1,3 @@
1
+ # Tango
2
+
3
+ In short: **Tango is an ETL framework**. Its job is to scrape content from a target website, `extract` the necessary data, `transform` it into the desired format and `load` it into a database.
data/tango.gemspec ADDED
@@ -0,0 +1,20 @@
1
+ require './lib/tango/version'
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = 'tango-etl'
5
+ s.version = Tango::VERSION
6
+ s.license = 'MIT'
7
+ s.summary = 'ETL framework'
8
+ s.description = 'Scrapping web content made even easier.'
9
+ s.authors = ['Maciej Komorowski']
10
+ s.email = 'mckomo@gmail.com'
11
+ s.files = `git ls-files`.split("\n") - %w[.gitignore]
12
+ s.test_files = s.files.select { |p| p =~ /^test\/*test_.*.rb/ }
13
+ s.homepage = 'https://github.com/mckomo/Tango'
14
+ s.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.1'
15
+ s.add_dependency 'httparty', '~> 0.13', '>= 0.13.1'
16
+ s.add_dependency 'activerecord', '~> 4.1', '>= 4.1.0'
17
+ s.add_dependency 'activerecord-import', '~> 0.5', '>= 0.5.0'
18
+ s.add_dependency 'ar-multidb', '~> 0.1', '>= 0.1.12'
19
+ end
20
+
@@ -0,0 +1,6 @@
1
+ ActiveRecord::Schema.define do
2
+ create_table :users do |t|
3
+ t.string :name
4
+ t.integer :age
5
+ end
6
+ end
@@ -0,0 +1,11 @@
1
+ module Model
2
+
3
+ class User < Tango::AbstractModel
4
+
5
+ def cache_key
6
+ [name, age]
7
+ end
8
+
9
+ end
10
+
11
+ end
@@ -0,0 +1,18 @@
1
+ class SimpleBuffer
2
+ # Call counters for #register and #fill; NOTE(review): `fill_couter` is misspelled but callers read it under that name
3
+ attr_reader :register_counter, :fill_couter
4
+
5
+ def initialize
6
+ @register_counter = 0
7
+ @fill_couter = 0
8
+ end
9
+ # Counts the call; the type argument is not used
10
+ def register( type )
11
+ @register_counter += 1
12
+ end
13
+ # Counts the call; type and resource are not used
14
+ def fill( type, resource )
15
+ @fill_couter += 1
16
+ end
17
+
18
+ end
@@ -0,0 +1,18 @@
1
+ class SimpleBuffer
2
+ # Call counters for #register and #fill; NOTE(review): `fill_couter` is misspelled but callers read it under that name
3
+ attr_reader :register_counter, :fill_couter
4
+
5
+ def initialize
6
+ @register_counter = 0
7
+ @fill_couter = 0
8
+ end
9
+ # Counts the call; the type argument is not used
10
+ def register( type )
11
+ @register_counter += 1
12
+ end
13
+ # Counts the call; type and resource are not used
14
+ def fill( type, resource )
15
+ @fill_couter += 1
16
+ end
17
+
18
+ end
@@ -0,0 +1,22 @@
1
+ require "test/unit"
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+
5
+ require 'tango/resource/cache'
6
+ require_relative '../../support/lib/simple_handler.rb'
7
+
8
+ class TestDispatcher < Test::Unit::TestCase
9
+
10
+ context "a handler dispatcher" do
11
+
12
+ setup do
13
+ @dispatcher # NOTE(review): bare reference, nothing is assigned here - placeholder setup
14
+ end
15
+
16
+ should "throw exception when trying to load unregistered type" do
17
+ # TODO: test body is empty - no assertion is made yet
18
+ end
19
+
20
+ end
21
+
22
+ end
@@ -0,0 +1,51 @@
1
+ require "test/unit"
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+
5
+ require './lib/tango/resource/buffer.rb'
6
+
7
+ class TestBuffer < Test::Unit::TestCase
8
+
9
+ context "a buffer" do
10
+
11
+ setup do
12
+ @buffer_size = 10
13
+ @buffer = Tango::Resource::Buffer.new( @buffer_size )
14
+ end
15
+
16
+ should "throw exception when try register type without callback object with unregistered type" do
17
+ assert_raise ArgumentError do
18
+ @buffer.register( :foo )
19
+ end
20
+ end
21
+
22
+ should "throw exception when try to fill object with unregistered type" do
23
+ assert_raise ArgumentError do
24
+ @buffer.fill( :foo, Object.new )
25
+ end
26
+ end
27
+
28
+ should "give ability to register new type of resource" do
29
+ @buffer.register( :foo ) do; nil; end
30
+ @buffer.fill( :foo, Object.new )
31
+ end
32
+
33
+ should "trigger release callback when buffer size exceeded" do
34
+
35
+ counter = 0
36
+
37
+ @buffer.register( :foo ) do
38
+ counter += 1
39
+ end
40
+
41
+ ( @buffer_size * 5 ).times do
42
+ @buffer.fill( :foo, Object.new )
43
+ end # Full buffer 5 times
44
+
45
+ assert_equal 5, counter
46
+
47
+ end
48
+
49
+ end
50
+
51
+ end
@@ -0,0 +1,120 @@
1
+ require "test/unit"
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+
5
+ require 'tango/resource/cache'
6
+ require_relative '../../support/lib/simple_buffer.rb'
7
+
8
+ class TestCache < Test::Unit::TestCase
9
+
10
+ context "a cache" do
11
+
12
+ setup do
13
+ @buffer = SimpleBuffer.new
14
+ @cache = Tango::Resource::Cache.new( @buffer )
15
+ @foo_resource = stub( cache_key: "foo" )
16
+ end
17
+
18
+ should "throw exception when trying to load unregistered type" do
19
+ assert_raise ArgumentError do
20
+ @cache.load( :foo, "bar" )
21
+ end
22
+ end
23
+
24
+ should "throw exception when trying to load yet not cached resource without callback" do
25
+
26
+ @cache.register( :foo ) do
27
+ nil
28
+ end
29
+
30
+ assert_raise ArgumentError do
31
+ @cache.load( :foo, @foo_resource )
32
+ end
33
+
34
+ end
35
+
36
+ should "be able to store recourse" do
37
+
38
+ @cache.register( :foo ) do
39
+ nil
40
+ end
41
+
42
+ @cache.set( :foo, @foo_resource )
43
+ assert_equal @foo_resource, @cache.get( :foo, @foo_resource )
44
+
45
+ end
46
+
47
+ should "be able to load already stored recourse" do
48
+
49
+ @cache.register( :foo ) do
50
+ nil
51
+ end
52
+
53
+ @cache.set( :foo, @foo_resource )
54
+ assert_equal @foo_resource, @cache.load( :foo, @foo_resource )
55
+
56
+ end
57
+
58
+ should "load yet not cached value using callback" do
59
+
60
+ @cache.register( :foo ) do
61
+ nil
62
+ end
63
+
64
+ loaded_resource = @cache.load( :foo, @foo_resource ) do
65
+ @foo_resource
66
+ end
67
+
68
+ assert_equal @foo_resource, loaded_resource
69
+ assert_equal @foo_resource, @cache.load( :foo, @foo_resource )
70
+
71
+ end
72
+
73
+ should "also register new types with buffer" do
74
+
75
+ 10.times do |i|
76
+ @cache.register( i ) do
77
+ nil
78
+ end
79
+ end
80
+
81
+ assert_equal 10, @buffer.register_counter
82
+
83
+ end
84
+
85
+ should "fill buffer with loaded resources" do
86
+
87
+ @cache.register( :foo ) do
88
+ nil
89
+ end
90
+
91
+ 2.times do |i|
92
+ resouce = stub( cache_key: i )
93
+ loaded_resource = @cache.load( :foo, resouce ) do
94
+ resouce
95
+ end
96
+ end
97
+
98
+ assert_equal 2, @buffer.fill_couter
99
+
100
+ end
101
+
102
+ should "fill buffer only once with same resource" do
103
+
104
+ @cache.register( :foo ) do
105
+ nil
106
+ end
107
+
108
+ 2.times do |i|
109
+ loaded_resource = @cache.load( :foo, @foo_resource ) do
110
+ @foo_resource
111
+ end
112
+ end
113
+
114
+ assert_equal 1, @buffer.fill_couter
115
+
116
+ end
117
+
118
+ end
119
+
120
+ end
@@ -0,0 +1,43 @@
1
+ require 'test/unit'
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+ require 'active_record'
5
+ require 'activerecord-nulldb-adapter'
6
+
7
+ require 'tango/abstract_model'
8
+ require_relative '../support/lib/model/user.rb'
9
+
10
+ class TestAbstractModel < Test::Unit::TestCase
11
+
12
+ context "instance of a model that extends AbstractModel" do
13
+
14
+ setup do
15
+
16
+ ActiveRecord::Base.establish_connection :adapter => :nulldb,
17
+ :schema => Dir.pwd + '/test/support/db/schema.rb'
18
+ @model = Model::User.new
19
+ @model.name = "Maciej"
20
+ @model.age = 22
21
+
22
+ end
23
+
24
+ should "have cache key" do
25
+ assert_equal ["Maciej", 22], @model.cache_key
26
+ end
27
+
28
+ should "have array list with its properties values" do
29
+ assert_equal [nil, "Maciej", 22], @model.values
30
+ end
31
+
32
+ end
33
+
34
+ context "model class that extends AbstractModel" do
35
+
36
+ should "know what is the next available id" do
37
+ assert_equal 1, Model::User.next_id
38
+ assert_equal 2, Model::User.next_id
39
+ end
40
+
41
+ end
42
+
43
+ end
@@ -0,0 +1,32 @@
1
+ require 'test/unit'
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+
5
+ require 'tango/database_locker'
6
+
7
+ class TestDatabaseLocker < Test::Unit::TestCase
8
+
9
+ context "a database locker" do
10
+
11
+ setup do
12
+ @lock_path = './tmp/database_test.lock'
13
+ @locker = Tango::DatabaseLocker.new( ["master", "slave"], @lock_path )
14
+ end
15
+
16
+ teardown do
17
+ File.delete( @lock_path )
18
+ end
19
+
20
+ should "create lock file" do
21
+ @locker.lock( "master" )
22
+ assert File.exists?( @lock_path )
23
+ assert_equal "master", IO.read( @lock_path )
24
+ end
25
+
26
+ should "find unlocked database" do
27
+ @locker.lock( "slave" )
28
+ assert_equal "master", @locker.unlocked
29
+ end
30
+
31
+ end
32
+ end
@@ -0,0 +1,35 @@
1
+ require 'test/unit'
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+
5
+ require 'tango/kernel'
6
+
7
+ class TestKernel < Test::Unit::TestCase
8
+
9
+ context "a kernel" do
10
+
11
+ should "transform file path to name of a class" do
12
+ assert_equal "FooBar", Tango::Kernel.classify( "./../lib/foo_bar.rb" )
13
+ end
14
+
15
+ should "load a class from a file" do
16
+ klass = Tango::Kernel.load( Dir.pwd + '/test/support/lib/simple_buffer.rb' )
17
+ assert_equal SimpleBuffer, klass
18
+ end
19
+
20
+ should "load a class in a module from a file" do
21
+ klass = Tango::Kernel.load( Dir.pwd + '/test/support/lib/model/user.rb', 'Model::' )
22
+ assert_equal Model::User, klass
23
+ end
24
+
25
+ should "obtain symbol from a class" do
26
+ assert_equal :simple_buffer, Tango::Kernel.symbolize( SimpleBuffer )
27
+ end
28
+
29
+ should "obtain symbol from a class in a module" do
30
+ assert_equal :user, Tango::Kernel.symbolize( Model::User )
31
+ end
32
+
33
+ end
34
+
35
+ end
@@ -0,0 +1,49 @@
1
+ require 'test/unit'
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+
5
+ require 'tango/link_stack'
6
+
7
+ class TestLinkStack < Test::Unit::TestCase
8
+
9
+ context "a link stack" do
10
+
11
+ setup do
12
+ @stack = Tango::LinkStack.new( 'http://example.com/data?xml' )
13
+ end
14
+
15
+ should "raise error when initialized with incorrect URL" do
16
+ assert_raise ArgumentError do
17
+ Tango::LinkStack.new( 'ImNotA/Link' )
18
+ end
19
+ end
20
+
21
+ should "know host of base link" do
22
+ assert_equal 'http://example.com:80', @stack.host
23
+ end
24
+
25
+ should "contain initial link path" do
26
+ assert @stack.has_links?
27
+ assert_equal ['/data?xml'], @stack.links
28
+ assert_equal '/data?xml', @stack.shift
29
+ assert ! @stack.has_links?
30
+ end
31
+
32
+ should "store appended links" do
33
+
34
+ @stack.shift # shift initial path
35
+
36
+ @stack.append '/data/bids'
37
+ @stack.append [ '/data/bids/1', '/data/bids/2' ]
38
+ assert_equal 3, @stack.links.count
39
+
40
+ assert_equal '/data/bids', @stack.shift
41
+ assert_equal '/data/bids/1', @stack.shift
42
+ assert_equal '/data/bids/2', @stack.shift
43
+
44
+ assert ! @stack.has_links?
45
+
46
+ end
47
+
48
+ end
49
+ end