tango-etl 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,81 @@
1
module Tango
  module Resource

    # Key-value caching system for resources.
    #
    # Resources are grouped by a registered type and, within a type, indexed
    # by the value returned from their +cache_key+ method. Every resource that
    # gets cached through {#load} is also handed over to the buffer.
    #
    # @author Mckomo
    class Cache

      attr_reader :buffer

      # Constructor of the cache
      #
      # @param buffer [#register, #fill, nil] collaborator informed about
      #   registered types and newly cached resources; when omitted (or nil)
      #   a new +Buffer+ is instantiated
      def initialize(buffer = nil)
        # Set dependencies
        @buffer = buffer || Buffer.new
        # Container for resources cache: { type => { cache_key => resource } }
        @storage = {}
      end

      # Register new type of resource to be cached
      #
      # Registering an already known type replaces its container with an
      # empty one.
      #
      # @param type [Symbol, Object] identifier of the resource type
      # @param release_callback [Proc] block forwarded untouched to the buffer
      # @return [Object] whatever the buffer's +register+ returns
      def register(type, &release_callback)
        # Create container for cache of new resource
        @storage[type] = {}
        # Also register new type with buffer
        @buffer.register(type, &release_callback)
      end

      # Get a cached resource, or use the given block to build, cache and
      # return it
      #
      # @param type [Symbol, Object] registered resource type
      # @param resource [#cache_key] object whose cache key is looked up
      # @yieldparam resource [Object] the resource passed to this method
      # @yieldreturn [#cache_key] the resource that should be cached
      # @return [Object] the cached resource
      # @raise [ArgumentError] when the type is not registered, or when the
      #   resource is not cached yet and no block was given
      def load(type, resource)
        cached_resource = get(type, resource)
        return cached_resource if cached_resource

        raise ArgumentError, "No resource callback given" unless block_given?

        # Not found: let the caller's block produce the resource to cache
        cached_resource = yield(resource)
        # The new resource is stored under its own cache key
        set(type, cached_resource)
        # Fill buffer with newly cached resource
        @buffer.fill(type, cached_resource)

        cached_resource
      end

      # Setter for the cache storage
      #
      # @param type [Symbol, Object] registered resource type
      # @param resource [#cache_key] resource stored under its cache key
      # @return [Object] the stored resource
      # @raise [ArgumentError] when the type is not registered
      def set(type, resource)
        raise ArgumentError, "Trying to set resource with unregistered type" unless @storage.key?(type)
        @storage[type][resource.cache_key] = resource
      end

      # Getter for the cache storage
      #
      # @param type [Symbol, Object] registered resource type
      # @param resource [#cache_key] object whose cache key is looked up
      # @return [Object, nil] the cached resource, or nil when nothing is cached
      # @raise [ArgumentError] when the type is not registered
      def get(type, resource)
        raise ArgumentError, "Trying to get resource with unregistered type" unless @storage.key?(type)
        @storage[type][resource.cache_key]
      end

    end
  end
end
@@ -0,0 +1,2 @@
1
+ require 'tango/resource/buffer'
2
+ require 'tango/resource/cache'
@@ -0,0 +1,3 @@
1
module Tango
  # Version of the tango-etl gem; frozen so the constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/lib/tango.rb ADDED
@@ -0,0 +1,18 @@
1
+ # Tango == ETL => true
2
+ require 'nokogiri'
3
+ require 'httparty'
4
+ require 'active_record'
5
+ require 'activerecord-import'
6
+ require 'ar-multidb'
7
+
8
+ require 'yaml'
9
+ require 'logger'
10
+
11
+ require 'tango/app'
12
+ require 'tango/kernel'
13
+ require 'tango/multidb'
14
+ require 'tango/abstract_model'
15
+ require 'tango/link_stack'
16
+ require 'tango/database_locker'
17
+ require 'tango/etl'
18
+ require 'tango/resource'
data/readme.md ADDED
@@ -0,0 +1,3 @@
1
+ # Tango
2
+
3
+ In short: **Tango is an ETL framework**. Its job is to scrape content from a target website, `extract` the necessary data, `transform` it into the desired format and `load` it into a database.
data/tango.gemspec ADDED
@@ -0,0 +1,20 @@
1
# Resolve the version file relative to this gemspec, not the current directory.
require_relative 'lib/tango/version'

# Gem specification for the tango-etl gem.
Gem::Specification.new do |s|
  s.name        = 'tango-etl'
  s.version     = Tango::VERSION
  s.license     = 'MIT'
  s.summary     = 'ETL framework'
  s.description = 'Scrapping web content made even easier.'
  s.authors     = ['Maciej Komorowski']
  s.email       = 'mckomo@gmail.com'
  # Package every file tracked by git except .gitignore.
  s.files       = `git ls-files`.split("\n") - %w[.gitignore]
  # Any test_*.rb directly in test/ or in one of its subdirectories.
  # The previous pattern only allowed repeated slashes between "test" and
  # "test_", so nested test files were never matched.
  # NOTE(review): assumes the tests live below test/ in subdirectories — confirm.
  s.test_files  = s.files.grep(%r{\Atest/(?:.*/)?test_[^/]*\.rb\z})
  s.homepage    = 'https://github.com/mckomo/Tango'
  s.add_dependency 'nokogiri', '~> 1.6', '>= 1.6.1'
  s.add_dependency 'httparty', '~> 0.13', '>= 0.13.1'
  s.add_dependency 'activerecord', '~> 4.1', '>= 4.1.0'
  s.add_dependency 'activerecord-import', '~> 0.5', '>= 0.5.0'
  s.add_dependency 'ar-multidb', '~> 0.1', '>= 0.1.12'
end
20
+
@@ -0,0 +1,6 @@
1
# Schema with a single users table holding a name and an age.
ActiveRecord::Schema.define do
  create_table(:users) do |table|
    table.string(:name)
    table.integer(:age)
  end
end
@@ -0,0 +1,11 @@
1
module Model
  # User model built on Tango::AbstractModel.
  class User < Tango::AbstractModel
    # Key identifying this user in a cache.
    #
    # @return [Array] the user's name followed by the user's age
    def cache_key
      [name, age]
    end
  end
end
@@ -0,0 +1,18 @@
1
# Buffer test double that only counts how often it was called.
class SimpleBuffer

  attr_reader :register_counter, :fill_couter

  # Correctly spelled reader; +fill_couter+ is kept for existing callers.
  alias_method :fill_counter, :fill_couter

  def initialize
    @register_counter = 0
    @fill_couter = 0
  end

  # Count a type registration; the type (and any block given) is ignored.
  #
  # @param type [Object]
  # @return [Integer] number of registrations so far
  def register(type)
    @register_counter += 1
  end

  # Count a fill; the type and the resource are ignored.
  #
  # @param type [Object]
  # @param resource [Object]
  # @return [Integer] number of fills so far
  def fill(type, resource)
    @fill_couter += 1
  end

end
@@ -0,0 +1,18 @@
1
# Buffer test double that only counts how often it was called.
class SimpleBuffer

  attr_reader :register_counter, :fill_couter

  # Correctly spelled reader; +fill_couter+ is kept for existing callers.
  alias_method :fill_counter, :fill_couter

  def initialize
    @register_counter = 0
    @fill_couter = 0
  end

  # Count a type registration; the type (and any block given) is ignored.
  #
  # @param type [Object]
  # @return [Integer] number of registrations so far
  def register(type)
    @register_counter += 1
  end

  # Count a fill; the type and the resource are ignored.
  #
  # @param type [Object]
  # @param resource [Object]
  # @return [Integer] number of fills so far
  def fill(type, resource)
    @fill_couter += 1
  end

end
@@ -0,0 +1,22 @@
1
+ require "test/unit"
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+
5
+ require 'tango/resource/cache'
6
+ require_relative '../../support/lib/simple_handler.rb'
7
+
8
# Placeholder test case for a handler dispatcher; nothing is asserted yet.
class TestDispatcher < Test::Unit::TestCase
  context "a handler dispatcher" do
    setup do
      # TODO: no dispatcher is built here; this only evaluates an unset ivar
      @dispatcher
    end

    should "throw exception when trying to load unregistered type" do
      # TODO: test body not implemented
    end
  end
end
@@ -0,0 +1,51 @@
1
+ require "test/unit"
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+
5
+ require './lib/tango/resource/buffer.rb'
6
+
7
# Unit tests for Tango::Resource::Buffer.
class TestBuffer < Test::Unit::TestCase
  context "a buffer" do
    setup do
      @buffer_size = 10
      @buffer = Tango::Resource::Buffer.new(@buffer_size)
    end

    should "throw exception when try register type without callback object with unregistered type" do
      assert_raise(ArgumentError) { @buffer.register(:foo) }
    end

    should "throw exception when try to fill object with unregistered type" do
      assert_raise(ArgumentError) { @buffer.fill(:foo, Object.new) }
    end

    should "give ability to register new type of resource" do
      @buffer.register(:foo) { nil }
      @buffer.fill(:foo, Object.new)
    end

    should "trigger release callback when buffer size exceeded" do
      releases = 0
      @buffer.register(:foo) { releases += 1 }

      # Fill the buffer to capacity five times over
      fills = @buffer_size * 5
      fills.times { @buffer.fill(:foo, Object.new) }

      assert_equal 5, releases
    end
  end
end
@@ -0,0 +1,120 @@
1
+ require "test/unit"
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+
5
+ require 'tango/resource/cache'
6
+ require_relative '../../support/lib/simple_buffer.rb'
7
+
8
# Unit tests for Tango::Resource::Cache, run against the SimpleBuffer double.
class TestCache < Test::Unit::TestCase

  context "a cache" do

    setup do
      @buffer = SimpleBuffer.new
      @cache = Tango::Resource::Cache.new( @buffer )
      @foo_resource = stub( cache_key: "foo" )
    end

    should "throw exception when trying to load unregistered type" do
      assert_raise ArgumentError do
        @cache.load( :foo, "bar" )
      end
    end

    should "throw exception when trying to load yet not cached resource without callback" do
      @cache.register( :foo ) { nil }

      assert_raise ArgumentError do
        @cache.load( :foo, @foo_resource )
      end
    end

    should "be able to store recourse" do
      @cache.register( :foo ) { nil }

      @cache.set( :foo, @foo_resource )
      assert_equal @foo_resource, @cache.get( :foo, @foo_resource )
    end

    should "be able to load already stored recourse" do
      @cache.register( :foo ) { nil }

      @cache.set( :foo, @foo_resource )
      assert_equal @foo_resource, @cache.load( :foo, @foo_resource )
    end

    should "load yet not cached value using callback" do
      @cache.register( :foo ) { nil }

      loaded_resource = @cache.load( :foo, @foo_resource ) do
        @foo_resource
      end

      assert_equal @foo_resource, loaded_resource
      # Second load has no block, so it must be served from the cache
      assert_equal @foo_resource, @cache.load( :foo, @foo_resource )
    end

    should "also register new types with buffer" do
      10.times do |i|
        @cache.register( i ) { nil }
      end

      assert_equal 10, @buffer.register_counter
    end

    should "fill buffer with loaded resources" do
      @cache.register( :foo ) { nil }

      # Two resources with distinct cache keys, so both are cache misses
      2.times do |i|
        resource = stub( cache_key: i )
        @cache.load( :foo, resource ) { resource }
      end

      assert_equal 2, @buffer.fill_couter
    end

    should "fill buffer only once with same resource" do
      @cache.register( :foo ) { nil }

      # Same cache key twice: only the first load is a miss
      2.times do
        @cache.load( :foo, @foo_resource ) { @foo_resource }
      end

      assert_equal 1, @buffer.fill_couter
    end

  end

end
@@ -0,0 +1,43 @@
1
+ require 'test/unit'
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+ require 'active_record'
5
+ require 'activerecord-nulldb-adapter'
6
+
7
+ require 'tango/abstract_model'
8
+ require_relative '../support/lib/model/user.rb'
9
+
10
# Unit tests for models derived from Tango::AbstractModel, using Model::User.
class TestAbstractModel < Test::Unit::TestCase
  context "instance of a model that extends AbstractModel" do
    setup do
      schema_path = "#{Dir.pwd}/test/support/db/schema.rb"
      ActiveRecord::Base.establish_connection(adapter: :nulldb, schema: schema_path)

      @model = Model::User.new
      @model.name = "Maciej"
      @model.age = 22
    end

    should "have cache key" do
      expected_key = ["Maciej", 22]
      assert_equal expected_key, @model.cache_key
    end

    should "have array list with its properties values" do
      expected_values = [nil, "Maciej", 22]
      assert_equal expected_values, @model.values
    end
  end

  context "model class that extends AbstractModel" do
    should "know what is the next available id" do
      # Two consecutive calls are expected to hand out consecutive ids
      [1, 2].each do |expected_id|
        assert_equal expected_id, Model::User.next_id
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
+ require 'test/unit'
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+
5
+ require 'tango/database_locker'
6
+
7
# Unit tests for Tango::DatabaseLocker.
class TestDatabaseLocker < Test::Unit::TestCase

  context "a database locker" do

    setup do
      @lock_path = './tmp/database_test.lock'
      @locker = Tango::DatabaseLocker.new( ["master", "slave"], @lock_path )
    end

    teardown do
      # Only remove the lock file if a test actually created it; otherwise
      # File.delete would raise and hide the real test failure.
      File.delete( @lock_path ) if File.exist?( @lock_path )
    end

    should "create lock file" do
      @locker.lock( "master" )
      # File.exists? was removed in Ruby 3.2; File.exist? works on all versions
      assert File.exist?( @lock_path )
      assert_equal "master", IO.read( @lock_path )
    end

    should "find unlocked database" do
      @locker.lock( "slave" )
      assert_equal "master", @locker.unlocked
    end

  end
end
@@ -0,0 +1,35 @@
1
+ require 'test/unit'
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+
5
+ require 'tango/kernel'
6
+
7
# Unit tests for the Tango::Kernel helpers (classify, load, symbolize).
class TestKernel < Test::Unit::TestCase
  context "a kernel" do
    should "transform file path to name of a class" do
      class_name = Tango::Kernel.classify("./../lib/foo_bar.rb")
      assert_equal "FooBar", class_name
    end

    should "load a class from a file" do
      path = "#{Dir.pwd}/test/support/lib/simple_buffer.rb"
      assert_equal SimpleBuffer, Tango::Kernel.load(path)
    end

    should "load a class in a module from a file" do
      path = "#{Dir.pwd}/test/support/lib/model/user.rb"
      assert_equal Model::User, Tango::Kernel.load(path, 'Model::')
    end

    should "obtain symbol from a class" do
      symbol = Tango::Kernel.symbolize(SimpleBuffer)
      assert_equal :simple_buffer, symbol
    end

    should "obtain symbol from a class in a module" do
      symbol = Tango::Kernel.symbolize(Model::User)
      assert_equal :user, symbol
    end
  end
end
@@ -0,0 +1,49 @@
1
+ require 'test/unit'
2
+ require 'shoulda'
3
+ require 'mocha/setup'
4
+
5
+ require 'tango/link_stack'
6
+
7
# Unit tests for Tango::LinkStack.
class TestLinkStack < Test::Unit::TestCase
  context "a link stack" do
    setup do
      @stack = Tango::LinkStack.new('http://example.com/data?xml')
    end

    should "raise error when initialized with incorrect URL" do
      assert_raise(ArgumentError) { Tango::LinkStack.new('ImNotA/Link') }
    end

    should "know host of base link" do
      assert_equal 'http://example.com:80', @stack.host
    end

    should "contain initial link path" do
      assert @stack.has_links?
      assert_equal ['/data?xml'], @stack.links
      assert_equal '/data?xml', @stack.shift
      assert !@stack.has_links?
    end

    should "store appended links" do
      # Drop the initial path so only appended links remain
      @stack.shift

      @stack.append('/data/bids')
      @stack.append(['/data/bids/1', '/data/bids/2'])
      assert_equal 3, @stack.links.count

      # Links come back in the order they were appended
      assert_equal '/data/bids', @stack.shift
      assert_equal '/data/bids/1', @stack.shift
      assert_equal '/data/bids/2', @stack.shift

      assert !@stack.has_links?
    end
  end
end