rpareia 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8fb7c8a13b2133209c080eaf3eba626412e71659
4
+ data.tar.gz: e3acad44a34d0ba4d74b746aa82fb09cce9ddf64
5
+ SHA512:
6
+ metadata.gz: 0ab1079409ef2541c80d838925b59b8f8f11d259e993248c326e9f451091906184299870bc14c9364be02b68175f8a85c4b66091a4d81d2bc4a692efbad57d9b
7
+ data.tar.gz: 1a5182f13b2f71fe6ecac5cfe07b6488b789da43c583b9e5245ab84a1e40dec67254a2721f46240faee4ef8addb70974e2c86e3537b99b6b24f8c48784cdf6b0
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --require spec_helper
3
+ --format doc
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - "2.1.0"
4
+ - "2.2.0"
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in rpareia.gemspec
4
+ gemspec
@@ -0,0 +1,84 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ rpareia (0.0.1)
5
+ gli (~> 2.12)
6
+ nokogiri (~> 1.6.6)
7
+ rbczmq (~> 1.7)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ celluloid (0.16.0)
13
+ timers (~> 4.0.0)
14
+ coderay (1.1.0)
15
+ diff-lcs (1.2.5)
16
+ ffi (1.9.6)
17
+ formatador (0.2.5)
18
+ gli (2.12.2)
19
+ guard (2.11.1)
20
+ formatador (>= 0.2.4)
21
+ listen (~> 2.7)
22
+ lumberjack (~> 1.0)
23
+ nenv (~> 0.1)
24
+ notiffany (~> 0.0)
25
+ pry (>= 0.9.12)
26
+ shellany (~> 0.0)
27
+ thor (>= 0.18.1)
28
+ guard-compat (1.2.1)
29
+ guard-rspec (4.5.0)
30
+ guard (~> 2.1)
31
+ guard-compat (~> 1.1)
32
+ rspec (>= 2.99.0, < 4.0)
33
+ hitimes (1.2.2)
34
+ listen (2.8.5)
35
+ celluloid (>= 0.15.2)
36
+ rb-fsevent (>= 0.9.3)
37
+ rb-inotify (>= 0.9)
38
+ lumberjack (1.0.9)
39
+ method_source (0.8.2)
40
+ mini_portile (0.6.2)
41
+ nenv (0.2.0)
42
+ nokogiri (1.6.6.2)
43
+ mini_portile (~> 0.6.0)
44
+ notiffany (0.0.3)
45
+ nenv (~> 0.1)
46
+ shellany (~> 0.0)
47
+ pry (0.10.1)
48
+ coderay (~> 1.1.0)
49
+ method_source (~> 0.8.1)
50
+ slop (~> 3.4)
51
+ rake (10.4.2)
52
+ rb-fsevent (0.9.4)
53
+ rb-inotify (0.9.5)
54
+ ffi (>= 0.5.0)
55
+ rbczmq (1.7.8)
56
+ rspec (3.2.0)
57
+ rspec-core (~> 3.2.0)
58
+ rspec-expectations (~> 3.2.0)
59
+ rspec-mocks (~> 3.2.0)
60
+ rspec-core (3.2.0)
61
+ rspec-support (~> 3.2.0)
62
+ rspec-expectations (3.2.0)
63
+ diff-lcs (>= 1.2.0, < 2.0)
64
+ rspec-support (~> 3.2.0)
65
+ rspec-mocks (3.2.0)
66
+ diff-lcs (>= 1.2.0, < 2.0)
67
+ rspec-support (~> 3.2.0)
68
+ rspec-support (3.2.1)
69
+ shellany (0.0.1)
70
+ slop (3.6.0)
71
+ thor (0.19.1)
72
+ timers (4.0.1)
73
+ hitimes
74
+
75
+ PLATFORMS
76
+ ruby
77
+
78
+ DEPENDENCIES
79
+ bundler (~> 1.7)
80
+ guard-rspec (~> 4.5)
81
+ pry (~> 0.10)
82
+ rake (~> 10.0)
83
+ rpareia!
84
+ rspec (~> 3.2)
@@ -0,0 +1,5 @@
1
+ guard :rspec, cmd: "bundle exec rspec" do
2
+ watch(%r{^spec/(.+)/.+_spec\.rb$})
3
+ watch(%r{^lib/(.+)/(.+)\.rb$}) { |m| "spec/#{m[1]}/#{m[2]}_spec.rb" }
4
+ watch('spec/spec_helper.rb') { "spec" }
5
+ end
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 Michel Boaventura
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,24 @@
1
+ [![CodeClimate](https://codeclimate.com/github/michelboaventura/rpareia/badges/gpa.svg)](https://codeclimate.com/github/michelboaventura/rpareia)
2
+ [![Travis](https://travis-ci.org/michelboaventura/rpareia.svg)](https://travis-ci.org/michelboaventura/rpareia)
3
+
4
+ # Rpareia
5
+
6
+ A Ruby implementation of Pareia[1], built on ZeroMQ.
7
+
8
+ [1] https://github.com/michelboaventura/pareia
9
+
10
+ ## Installation
11
+
12
+ $ gem install rpareia
13
+
14
+ ## Usage
15
+
16
+ TODO: Write usage instructions here
17
+
18
+ ## Contributing
19
+
20
+ 1. Fork it ( https://github.com/michelboaventura/rpareia/fork )
21
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
22
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
23
+ 4. Push to the branch (`git push origin my-new-feature`)
24
+ 5. Create a new Pull Request
@@ -0,0 +1,12 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ begin
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task :default => :spec
9
+ rescue LoadError
10
+ # RSpec is not installed, so the :spec and :default tasks are not defined.
11
+ #
12
+ end
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'gli'
4
+ require 'rpareia'
5
+
6
# Command-line entry point, built with the GLI DSL.
include GLI::App

program_desc "A data deduplication software"

desc "Handles with project files"
command :project do |c|
  c.desc 'Check a project xml for errors'
  c.flag [:c, :check]

  # When -c/--check is given, parse the named project file; Parser.new raises
  # on the first problem it finds, so reaching `puts` means the file passed.
  c.action do |_global_options, options, _args|
    project = options[:c]
    if project
      # File.read opens, reads and closes the file in one call, so no file
      # handle is left open (File.open(...).read never closed it).
      Rpareia::Parser.new(File.read(project))
      puts "Project file '#{project}' is OK"
    end
  end
end

exit run(ARGV)
@@ -0,0 +1,6 @@
1
+ require "rpareia/version"
2
+ require "rpareia/parser"
3
+
4
+ module Rpareia
5
+ # Your code goes here...
6
+ end
@@ -0,0 +1,52 @@
1
+ require_relative 'heartbeat'
2
+ require 'rbczmq'
3
+ require 'pry'
4
+
5
# Binds a ROUTER socket on a rank-derived port and counts the messages that
# arrive on it, while a background thread answers heartbeat checks.
class Blocker
  # Port of the blocker with rank 0; rank N listens on BLOCKER_PORT_START + N.
  BLOCKER_PORT_START = 2000
  CTX = ZMQ::Context.new

  # @param rank [Integer] offset added to BLOCKER_PORT_START to pick the port
  # @param debug [Boolean] forwarded to the sockets' +verbose=+
  def initialize(rank:, debug: false)
    @rank  = rank
    @debug = debug
    @port  = BLOCKER_PORT_START + @rank
  end

  # Starts the heartbeat thread, binds the socket and then receives forever.
  # Every message other than "EOF" increments a counter; each "EOF" prints the
  # running count. The loop has no exit condition.
  def start
    start_hb
    connect

    received = 0

    loop do
      frame = @socket.recv

      # Skip empty reads and frames whose first character is NUL
      # (presumably the ROUTER identity frames -- TODO confirm).
      next if frame.nil? || frame[0] == "\x00"

      if frame == "EOF"
        puts "TOTAL=#{received}"
      else
        received += 1
      end
    end
  end

  # Spawns a thread that repeatedly serves one heartbeat exchange on port
  # 2 * @port.
  def start_hb
    heartbeat_addr = "tcp://*:#{2 * @port}"

    Thread.new do
      loop do
        Heartbeat.check(addr: heartbeat_addr, ctx: CTX, type: 'server', debug: @debug)
      end
    end
  end

  # Creates the ROUTER socket and binds it to @port on all interfaces.
  def connect
    @socket = CTX.socket(:ROUTER)
    @socket.verbose = @debug
    @socket.bind("tcp://*:#{@port}")
  end
end
51
+
52
+ Blocker.new(rank: ARGV[0].to_i).start
@@ -0,0 +1,17 @@
1
# One-shot heartbeat exchange between two peers over a REQ/REP socket pair.
class Heartbeat
  # Performs a single heartbeat step on +addr+.
  #
  # With type 'client' a :REQ socket connects to +addr+ and sends one empty
  # message. With any other type a :REP socket binds +addr+ and waits for one
  # message. The socket is always closed before this method returns, including
  # when one of the socket operations raises.
  #
  # @param addr [String] endpoint handed to +connect+ / +bind+
  # @param ctx [#socket] object used to create the socket
  # @param type [String] 'client' selects the sending side, anything else the
  #   receiving side
  # @param debug [Boolean] forwarded to the socket's +verbose=+
  def self.check(addr:, ctx:, type:, debug: false)
    client = (type == 'client')
    hb = ctx.socket(client ? :REQ : :REP)

    begin
      hb.verbose = debug

      if client
        hb.connect(addr)
        hb.send("")
      else
        hb.bind(addr)
        hb.recv
      end
    ensure
      # Release the socket even if connect/bind/send/recv failed; otherwise a
      # caller that retries in a loop would leak one socket per failure.
      hb.close
    end
  end
end
@@ -0,0 +1,171 @@
1
+ require "bundler/setup"
2
+ require "nokogiri"
3
+ require "pry"
4
+
5
module Rpareia
  # Validates a project description given as XML and exposes what it extracted
  # through #project.
  #
  # After a successful parse, #project is a Hash with:
  #   :name         - value of the project's 'task' attribute
  #   :data_sources - Array of {id:, file:, fields: [{name:, type:}, ...]}
  #   :parts        - Array of {field_name:}
  #   :output       - value of the output element's 'deterministic' attribute
  #
  # Every validation failure raises one of the error classes nested below.
  class Parser
    attr_reader :project

    class SyntaxError < StandardError; end
    class InvalidTaskError < StandardError; end
    class InvalidNumberOfSources < StandardError; end
    class DuplicatedDataSourceId < StandardError; end
    class MissingDataSourceId < StandardError; end
    class MissingDataSourceFile < StandardError; end
    class InvalidDataSourceFile < StandardError; end
    class InvalidDataSourceType < StandardError; end
    class MissingFieldSeparator < StandardError; end
    class FieldsElementNotFound < StandardError; end
    class MissingFieldName < StandardError; end
    class InvalidFieldType < StandardError; end
    class DuplicatedFieldName < StandardError; end
    class DeterministicLinkageElementNotFound < StandardError; end
    class ConjunctionElementNotFound < StandardError; end
    class MultipleConjunctionElements < StandardError; end
    class MissingPart < StandardError; end
    class MissingPartFieldNameOnDataSource < StandardError; end
    class MissingOutputElement < StandardError; end
    class MultipleOutputElement < StandardError; end
    class MissingDeterministicAttribute < StandardError; end

    # Field types accepted on a <field> element.
    FIELD_TYPES = ['int', 'string'].freeze

    # @param xml [String] the project XML document
    # @raise one of the nested error classes when the document is invalid
    def initialize(xml)
      @xml = xml

      parse
    end

    private

    # Runs every validation step in order; the first failure aborts the parse.
    def parse
      parse_xml
      parse_task
      parse_data_sources
      parse_deterministic_linkage
      parse_output
    end

    # Replaces the raw XML string in @xml with the parsed document.
    # @raise [SyntaxError] (the nested class) listing all parser errors
    def parse_xml
      @xml = Nokogiri::XML(@xml)

      raise SyntaxError, @xml.errors.join("\n") unless @xml.errors.empty?
    end

    # Starts the project hash with the task name. A missing 'task' attribute
    # yields a nil name (rejected later as an invalid task) instead of a
    # NoMethodError from calling #value on nil.
    def parse_task
      task = @xml.xpath("/project/@task").first
      @project = {name: task ? task.value : nil}
    end

    # Returns the first element of +arr+ that occurs more than once, or nil.
    def find_duplicated(arr)
      arr.detect { |e| arr.rindex(e) != arr.index(e) }
    end

    # Validates every data-source element, then checks id uniqueness and that
    # the number of sources matches the task.
    def parse_data_sources
      @project[:data_sources] = []

      @xml.xpath("/project/data-sources/data-source").each do |data_source|
        @project[:data_sources] << parse_data_source(data_source)
      end

      dup = find_duplicated(@project[:data_sources].map { |source| source[:id] })
      raise DuplicatedDataSourceId, "Duplicated data source id '#{dup}'" if dup

      validate_source_count
    end

    # Validates one data-source element and returns its {id:, file:, fields:}.
    def parse_data_source(data_source)
      id = data_source['id'].to_s
      raise MissingDataSourceId if id.empty?

      file = data_source['file'].to_s
      raise MissingDataSourceFile, "Missing file attribute from data source '#{id}'" if file.empty?
      raise InvalidDataSourceFile, "File '#{file}' from data source '#{id}' does not exist" unless File.exist?(file)

      type = data_source['type'].to_s
      raise InvalidDataSourceType, "Data source type '#{type}' not supported" if type != "delimited"

      field_separator = data_source['field-separator'].to_s
      raise MissingFieldSeparator, "Missing field separator from data source '#{id}'" if field_separator.empty?

      {id: id, file: file, fields: parse_fields(data_source, id)}
    end

    # Validates the field elements of one data source and returns them as an
    # Array of {name:, type:}.
    def parse_fields(data_source, id)
      fields = data_source.xpath("fields/field")
      raise FieldsElementNotFound, "Element 'field' not found on data source '#{id}'" if fields.empty?

      my_fields = fields.map do |field|
        # to_s: an absent attribute reads as nil, which must be reported as a
        # missing name rather than blow up on nil.empty?.
        name = field['name'].to_s
        raise MissingFieldName, "Attribute 'name' not found on data source '#{id}'" if name.empty?

        field_type = field['type']
        unless FIELD_TYPES.include?(field_type)
          raise InvalidFieldType, "Invalid type '#{field_type}' from field '#{name}', data source '#{id}'"
        end

        {name: name, type: field_type}
      end

      dup = find_duplicated(my_fields.map { |f| f[:name] })
      raise DuplicatedFieldName, "Duplicated field name '#{dup}' on data source '#{id}'" if dup

      my_fields
    end

    # Linkage needs exactly two data sources, deduplication exactly one; any
    # other task name is rejected.
    def validate_source_count
      count = @project[:data_sources].size

      case @project[:name]
      when 'linkage'
        raise InvalidNumberOfSources, "Linkage: expected two data-source, #{count} given" if count != 2
      when 'deduplication'
        raise InvalidNumberOfSources, "Deduplication: expected one data-sources, #{count} given" if count != 1
      else
        raise InvalidTaskError, "Invalid task: '#{@project[:name]}'"
      end
    end

    # Requires exactly one conjunction with at least one part, and every part's
    # field-name to exist on every data source.
    def parse_deterministic_linkage
      deterministic_linkage = @xml.xpath("/project/deterministic-linkage")
      if deterministic_linkage.empty?
        raise DeterministicLinkageElementNotFound, "Missing deterministic-linkage element"
      end

      conjunction = deterministic_linkage.xpath("conjunction")
      size = conjunction.size
      raise ConjunctionElementNotFound, "Missing conjunction element" if size.zero?
      raise MultipleConjunctionElements, "Only one conjunction element is allowed, #{size} found" if size > 1

      parts = conjunction.xpath("part")
      # Message text kept verbatim (including "leas"): callers may match on it.
      raise MissingPart, "At leas one part element is required" if parts.empty?

      @project[:parts] = []
      parts.each do |part|
        field_name = part['field-name'].to_s
        raise MissingFieldName, "Missing attribute field-name on part element" if field_name.empty?

        @project[:parts] << {field_name: field_name}
      end

      @project[:parts].each do |part|
        @project[:data_sources].each do |data_source|
          next if data_source[:fields].map { |e| e[:name] }.include?(part[:field_name])

          raise MissingPartFieldNameOnDataSource,
                "Field name '#{part[:field_name]}' not found on data source '#{data_source[:id]}'"
        end
      end
    end

    # Requires exactly one output element carrying a 'deterministic' attribute.
    def parse_output
      output = @xml.xpath("/project/output")
      size = output.size

      raise MissingOutputElement, "Missing output element" if size.zero?
      raise MultipleOutputElement, "Only one output element is allowed, #{size} found" if size > 1

      @project[:output] = output.first['deterministic']
      return if @project[:output]

      raise MissingDeterministicAttribute, "Missing attribute 'deterministic' on output element"
    end
  end
end
@@ -0,0 +1,91 @@
1
+ require 'bundler/setup'
2
+ require 'rbczmq'
3
+ require_relative 'heartbeat'
4
+ require 'pry'
5
+
6
# Reads its share of a comma-separated file and sends each record to one of
# several blocker endpoints over DEALER sockets.
class Reader
  # Port of blocker 0; blocker N is reached on BLOCKER_PORT_START + N.
  BLOCKER_PORT_START = 2000
  CTX = ZMQ::Context.new

  # @param num_readers [Integer] total number of readers sharing the file
  # @param num_blocks [Integer] number of blocker endpoints to connect to
  # @param rank [Integer] this reader's index; it handles the lines whose
  #   zero-based index modulo num_readers equals rank
  # @param file [String] path of the input file
  # @param debug [Boolean] forwarded to the sockets' +verbose=+
  # @param addr [String] endpoint prefix, e.g. "tcp://localhost"
  def initialize(num_readers:, num_blocks:, rank:, file:, debug: false, addr: "tcp://localhost")
    @num_readers = num_readers
    @num_blocks = num_blocks
    @rank = rank
    @file = file
    @debug = debug
    @addr = addr
  end

  # Connects to every blocker, then streams the file.
  def start
    start_connections
    read_database
  end

  # Runs a client heartbeat against each blocker (on twice its port) and then
  # opens the data socket to it. @connections[i] talks to blocker i.
  def start_connections
    @connections = []

    @num_blocks.times do |i|
      port = BLOCKER_PORT_START + i
      Heartbeat.check(addr: "#{@addr}:#{2 * port}", ctx: CTX, type: 'client', debug: @debug)

      @connections << create_socket(port)
    end
  end

  # @return a DEALER socket connected to +port+ on @addr
  def create_socket(port)
    socket = CTX.socket(:DEALER)
    socket.verbose = @debug
    socket.connect("#{@addr}:#{port}")
    socket
  end

  # Sends this reader's lines, then an "EOF" marker on every connection, and
  # prints "EOF".
  def read_database
    # File.foreach closes the file once iteration ends; File.open(...).each_line
    # left the handle open.
    File.foreach(@file).with_index do |line, index|
      next if index % @num_readers != @rank

      record = line.strip
      # Blank lines (e.g. a trailing newline) carry no record; skip them
      # instead of crashing while building the pair.
      next if record.empty?

      send_pair(create_pair(record))
    end
    close_connections
    puts "EOF"
  end

  # Sends "EOF" on each connection, closes it, and destroys the context.
  def close_connections
    @connections.each do |c|
      c.send("EOF")
      c.close
    end
    CTX.destroy
  end

  # Sends +pair+ (joined with a comma) to the blocker chosen from its last
  # element.
  def send_pair(pair)
    block_id = which_block(pair.last)

    @connections[block_id].send(pair.join(','))
  end

  # Concatenates the given columns into a single key string.
  def create_hash(input)
    input.join
  end

  # Maps a key to a blocker index using its first byte. An empty key has no
  # first byte and is routed to blocker 0.
  def which_block(key)
    (key.bytes.first || 0) % @num_blocks
  end

  # Splits a comma-separated record into [first column, key built from the
  # remaining columns].
  def create_pair(input)
    line = input.split(',')
    # drop(1) equals line[1..-1] for non-empty arrays but returns [] (not nil)
    # when the split produced no columns at all.
    [line.first, create_hash(line.drop(1))]
  end
end
83
+
84
+ Reader.new(
85
+ debug: false,
86
+ num_readers: 2,
87
+ num_blocks: 1,
88
+ rank: ARGV[0].to_i,
89
+ file: (ARGV[1] || "input/10m.csv"),
90
+ addr: (ARGV[2] || "tcp://localhost")
91
+ ).start
@@ -0,0 +1,3 @@
1
+ module Rpareia
2
+ VERSION = "0.0.1"
3
+ end
@@ -0,0 +1,28 @@
1
+ require File.join([File.dirname(__FILE__),'lib','rpareia','version.rb'])
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = "rpareia"
5
+ s.description = "A data deduplication software, based on Pareia"
6
+ s.version = Rpareia::VERSION
7
+ s.authors = ["Michel Boaventura"]
8
+ s.email = ["michel.boaventura@gmail.com"]
9
+ s.homepage = "http://github.com/michelboaventura/rpareia"
10
+ s.platform = Gem::Platform::RUBY
11
+ s.summary = %q{Pareia implementation, with ruby and ZeroMQ}
12
+ s.files = `git ls-files -z`.split("\x0")
13
+ s.license = "MIT"
14
+ s.require_paths = ["lib"]
15
+ s.bindir = 'bin'
16
+ s.executables = ['rpareia']
17
+ s.test_files = s.files.grep(%r{^(spec)/})
18
+
19
+ s.add_development_dependency "bundler", "~> 1.7"
20
+ s.add_development_dependency "rake", "~> 10.0"
21
+ s.add_development_dependency "pry", "~> 0.10"
22
+ s.add_development_dependency "rspec", "~> 3.2"
23
+ s.add_development_dependency "guard-rspec", "~> 4.5"
24
+
25
+ s.add_dependency "nokogiri", "~> 1.6"
26
+ s.add_dependency "rbczmq", "~> 1.7"
27
+ s.add_dependency "gli", "~> 2.12"
28
+ end
File without changes
@@ -0,0 +1,215 @@
1
+ include Rpareia
2
+
3
+ DEFAULT_FILE = "spec/fixtures/empty_file.xml"
4
+
5
+ def xml_dedup(task: 'deduplication', ids: [0], files: [DEFAULT_FILE], types: ['delimited'], field_separators: ["\t"], fields: [[{name: 'id', type: 'int'}]], parts: [{:'field-name' => 'id'}], deterministic: true, conjunction: 1, output_deterministic: ['foo'])
6
+ xml(task: task, ids: ids, files: files, types: types, field_separators: field_separators, fields: fields, parts: parts, deterministic: deterministic, conjunction: conjunction, output_deterministic: output_deterministic)
7
+ end
8
+
9
+ def xml_linkage(task: 'linkage', ids: [0,1], files: [DEFAULT_FILE, DEFAULT_FILE], types: ['delimited', 'delimited'], field_separators: ["\t", "\t"], fields: [[{name: 'id', type: 'int'}], [{name: 'id', type: 'int'}]], parts: [{:'field-name' => 'id'}], deterministic: true, conjunction: 1, output_deterministic: ['foo'])
10
+
11
+ xml(task: task, ids: ids, files: files, types: types, field_separators: field_separators, fields: fields, parts: parts, deterministic: deterministic, conjunction: conjunction, output_deterministic: output_deterministic)
12
+ end
13
+
14
+ def xml(task:, ids: [], files: nil, types: nil, field_separators: nil, fields: nil, parts: nil, deterministic: true, conjunction: 1, output_deterministic: ['foo'])
15
+ Nokogiri::XML::Builder.new do |xml|
16
+ xml.project(task: task) {
17
+ xml.send('data-sources') {
18
+ ids.each_with_index do |id,i|
19
+ xml.send('data-source', id: id, file: files[i], type: types[i], :'field-separator' => field_separators[i]) {
20
+ xml.fields {
21
+ next unless fields[i]
22
+ fields[i].each do |field|
23
+ xml.field(field)
24
+ end
25
+ }
26
+ }
27
+ end
28
+ }
29
+ if deterministic
30
+ xml.send('deterministic-linkage') {
31
+ conjunction.times do
32
+ xml.conjunction {
33
+ parts.each do |part|
34
+ xml.part(part)
35
+ end
36
+ }
37
+ end
38
+ }
39
+ end
40
+ if output_deterministic
41
+ if output_deterministic.empty?
42
+ xml.output
43
+ else
44
+ output_deterministic.each do |out|
45
+ xml.output(deterministic: out)
46
+ end
47
+ end
48
+ end
49
+ }
50
+ end.to_xml
51
+ end
52
+
53
+ RSpec.describe Parser do
54
+ context "initialization" do
55
+ it "expects a xml string as argument" do
56
+ expect {Parser.new}.to raise_error(ArgumentError)
57
+ end
58
+ it "complains about invalid XML" do
59
+ bad_xml = '<?xml version="1.0 encoding="UTF-8"?>'
60
+ expect {Parser.new(bad_xml)}.to raise_error(Parser::SyntaxError)
61
+ end
62
+ end
63
+
64
+ context "tasks" do
65
+ it "accepts only valid tasks" do
66
+ xml = xml_dedup
67
+ expect {Parser.new(xml)}.to_not raise_error
68
+
69
+ xml = xml_linkage
70
+ expect {Parser.new(xml)}.to_not raise_error
71
+ end
72
+
73
+ it "rejects invalid tasks" do
74
+ xml = xml_dedup(task: 'foo')
75
+ expect {Parser.new(xml)}.to raise_error(Parser::InvalidTaskError, "Invalid task: 'foo'")
76
+ end
77
+ end
78
+
79
+ context "data source" do
80
+ it "accepts only one data sources when deduplicating" do
81
+ xml = xml_linkage(task: 'deduplication')
82
+ expect {Parser.new(xml)}.to raise_error(Parser::InvalidNumberOfSources, "Deduplication: expected one data-sources, 2 given")
83
+ end
84
+
85
+ it "accepts only two data sources when linking" do
86
+ xml = xml_dedup(task: 'linkage')
87
+ expect {Parser.new(xml)}.to raise_error(Parser::InvalidNumberOfSources, "Linkage: expected two data-source, 1 given")
88
+ end
89
+
90
+ it "rejects data sources without id" do
91
+ xml = xml_dedup(ids: [nil])
92
+ expect {Parser.new(xml)}.to raise_error(Parser::MissingDataSourceId)
93
+ end
94
+
95
+ it "rejects two data sources with same id" do
96
+ xml = xml_linkage(ids: [1,1])
97
+ expect {Parser.new(xml)}.to raise_error(Parser::DuplicatedDataSourceId, "Duplicated data source id '1'")
98
+ end
99
+
100
+ it "expects a file name" do
101
+ xml = xml_dedup(files: [''])
102
+ expect {Parser.new(xml)}.to raise_error(Parser::MissingDataSourceFile, "Missing file attribute from data source '0'")
103
+ end
104
+
105
+ it "rejects invalid file names" do
106
+ xml = xml_dedup(files: ['invalid'])
107
+ expect {Parser.new(xml)}.to raise_error(Parser::InvalidDataSourceFile, "File 'invalid' from data source '0' does not exist")
108
+ end
109
+
110
+ it "accepts only 'delimiter' type" do
111
+ xml = xml_dedup(types: ['delimited'])
112
+ expect {Parser.new(xml)}.to_not raise_error
113
+
114
+ xml = xml_dedup(types: ['foo'])
115
+ expect {Parser.new(xml)}.to raise_error(Parser::InvalidDataSourceType, "Data source type 'foo' not supported")
116
+ end
117
+
118
+ it "expects a field-separator" do
119
+ xml = xml_dedup(field_separators: ["\t"])
120
+ expect {Parser.new(xml)}.to_not raise_error
121
+
122
+ xml = xml_dedup(field_separators: [""])
123
+ expect {Parser.new(xml)}.to raise_error(Parser::MissingFieldSeparator, "Missing field separator from data source '0'")
124
+ end
125
+
126
+ it "expects a 'fields' entry" do
127
+ xml = xml_dedup(fields: [[{name: 'id', type: 'int'}]], parts: [{:'field-name' => 'id'}])
128
+ expect {Parser.new(xml)}.to_not raise_error
129
+
130
+ xml = xml_dedup(fields: [])
131
+ expect {Parser.new(xml)}.to raise_error(Parser::FieldsElementNotFound)
132
+ end
133
+
134
+ it "expect a name attribute on field element" do
135
+ xml = xml_dedup(fields: [[{name: '', type: 'int'}]])
136
+ expect {Parser.new(xml)}.to raise_error(Parser::MissingFieldName)
137
+ end
138
+
139
+ it "rejects duplicated names on the same data source" do
140
+ xml = xml_dedup(fields: [[{name: 'foo', type: 'int'},{name: 'foo', type: 'int'}]])
141
+ expect {Parser.new(xml)}.to raise_error(Parser::DuplicatedFieldName, "Duplicated field name 'foo' on data source '0'")
142
+ end
143
+
144
+ it "expect a valid field type" do
145
+ ['int', 'string'].each do |type|
146
+ xml = xml_dedup(fields: [[{name: 'foo', type: type}]], parts: [{:'field-name' => 'foo'}])
147
+ expect {Parser.new(xml)}.to_not raise_error
148
+ end
149
+
150
+ xml = xml_dedup(fields: [[{name: 'foo', type: 'bar'}]])
151
+ expect {Parser.new(xml)}.to raise_error(Parser::InvalidFieldType, "Invalid type 'bar' from field 'foo', data source '0'")
152
+ end
153
+ end
154
+
155
+ context "deterministic" do
156
+ it "expects a deterministic-linkage element" do
157
+ xml = xml_dedup(deterministic: false)
158
+ expect {Parser.new(xml)}.to raise_error(Parser::DeterministicLinkageElementNotFound, "Missing deterministic-linkage element")
159
+ end
160
+
161
+ it "expects a single conjunction element" do
162
+ xml = xml_dedup(conjunction: 0)
163
+ expect {Parser.new(xml)}.to raise_error(Parser::ConjunctionElementNotFound, "Missing conjunction element")
164
+
165
+ xml = xml_dedup(conjunction: 1)
166
+ expect {Parser.new(xml)}.to_not raise_error
167
+
168
+ conjunction = 2
169
+ xml = xml_dedup(conjunction: conjunction)
170
+ expect {Parser.new(xml)}.to raise_error(Parser::MultipleConjunctionElements, "Only one conjunction element is allowed, #{conjunction} found")
171
+ end
172
+
173
+ it "expects at leas one part element" do
174
+ xml = xml_dedup(fields: [[{name: 'foo', type: 'int'}]], parts: [{:'field-name' => 'foo'}])
175
+ expect {Parser.new(xml)}.to_not raise_error
176
+
177
+ xml = xml_dedup(parts: [])
178
+ expect {Parser.new(xml)}.to raise_error(Parser::MissingPart, "At leas one part element is required")
179
+ end
180
+
181
+ it "expects a field-name on part elements" do
182
+ xml = xml_dedup(fields: [[{name: 'foo', type: 'string'}]], parts: [{:'field-name' => 'foo'}])
183
+ expect {Parser.new(xml)}.to_not raise_error
184
+
185
+ xml = xml_dedup(parts: [{not_field_name: 'foo'}])
186
+ expect {Parser.new(xml)}.to raise_error(Parser::MissingFieldName, "Missing attribute field-name on part element")
187
+ end
188
+
189
+ it "accepts only field-names present on all data sources" do
190
+ xml = xml_linkage(fields: [[{name: 'foo', type: 'string'}],[{name: 'foo', type: 'string'}]], parts: [{:'field-name' => 'foo'}])
191
+ expect {Parser.new(xml)}.to_not raise_error
192
+
193
+ xml = xml_linkage(fields: [[{name: 'foo', type: 'string'}],[{name: 'bar', type: 'string'}]], parts: [{:'field-name' => 'foo'}])
194
+ expect {Parser.new(xml)}.to raise_error(Parser::MissingPartFieldNameOnDataSource, "Field name 'foo' not found on data source '1'")
195
+ end
196
+ end
197
+
198
+ context "output" do
199
+ it "expects a single 'output' element" do
200
+ xml = xml_dedup(output_deterministic: ["foo"])
201
+ expect {Parser.new(xml)}.to_not raise_error
202
+
203
+ xml = xml_dedup(output_deterministic: ["foo","bar"])
204
+ expect {Parser.new(xml)}.to raise_error(Parser::MultipleOutputElement, "Only one output element is allowed, 2 found")
205
+
206
+ xml = xml_dedup(output_deterministic: false)
207
+ expect {Parser.new(xml)}.to raise_error(Parser::MissingOutputElement, "Missing output element")
208
+ end
209
+
210
+ it "expects a deterministic attribute on output element" do
211
+ xml = xml_dedup(output_deterministic: "")
212
+ expect {Parser.new(xml)}.to raise_error(Parser::MissingDeterministicAttribute, "Missing attribute 'deterministic' on output element")
213
+ end
214
+ end
215
+ end
@@ -0,0 +1,21 @@
1
+ require 'rpareia'
2
+
3
+ RSpec.configure do |config|
4
+ config.expect_with :rspec do |expectations|
5
+ expectations.include_chain_clauses_in_custom_matcher_descriptions = true
6
+ end
7
+
8
+ config.disable_monkey_patching!
9
+
10
+ #config.warnings = true
11
+
12
+ if config.files_to_run.one?
13
+ config.default_formatter = 'doc'
14
+ end
15
+
16
+ #config.profile_examples = 10
17
+
18
+ config.order = :random
19
+
20
+ Kernel.srand config.seed
21
+ end
metadata ADDED
@@ -0,0 +1,179 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rpareia
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Michel Boaventura
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-02-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pry
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.10'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.10'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.2'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.2'
69
+ - !ruby/object:Gem::Dependency
70
+ name: guard-rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '4.5'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '4.5'
83
+ - !ruby/object:Gem::Dependency
84
+ name: nokogiri
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '1.6'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '1.6'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rbczmq
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.7'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.7'
111
+ - !ruby/object:Gem::Dependency
112
+ name: gli
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '2.12'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '2.12'
125
+ description: A data deduplication software, based on Pareia
126
+ email:
127
+ - michel.boaventura@gmail.com
128
+ executables:
129
+ - rpareia
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - ".rspec"
134
+ - ".travis.yml"
135
+ - Gemfile
136
+ - Gemfile.lock
137
+ - Guardfile
138
+ - LICENSE.txt
139
+ - README.md
140
+ - Rakefile
141
+ - bin/rpareia
142
+ - lib/rpareia.rb
143
+ - lib/rpareia/blocker.rb
144
+ - lib/rpareia/heartbeat.rb
145
+ - lib/rpareia/parser.rb
146
+ - lib/rpareia/reader.rb
147
+ - lib/rpareia/version.rb
148
+ - rpareia.gemspec
149
+ - spec/fixtures/empty_file.xml
150
+ - spec/rpareia/parser_spec.rb
151
+ - spec/spec_helper.rb
152
+ homepage: http://github.com/michelboaventura/rpareia
153
+ licenses:
154
+ - MIT
155
+ metadata: {}
156
+ post_install_message:
157
+ rdoc_options: []
158
+ require_paths:
159
+ - lib
160
+ required_ruby_version: !ruby/object:Gem::Requirement
161
+ requirements:
162
+ - - ">="
163
+ - !ruby/object:Gem::Version
164
+ version: '0'
165
+ required_rubygems_version: !ruby/object:Gem::Requirement
166
+ requirements:
167
+ - - ">="
168
+ - !ruby/object:Gem::Version
169
+ version: '0'
170
+ requirements: []
171
+ rubyforge_project:
172
+ rubygems_version: 2.4.5
173
+ signing_key:
174
+ specification_version: 4
175
+ summary: Pareia implementation, with ruby and ZeroMQ
176
+ test_files:
177
+ - spec/fixtures/empty_file.xml
178
+ - spec/rpareia/parser_spec.rb
179
+ - spec/spec_helper.rb