rpareia 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 8fb7c8a13b2133209c080eaf3eba626412e71659
4
+ data.tar.gz: e3acad44a34d0ba4d74b746aa82fb09cce9ddf64
5
+ SHA512:
6
+ metadata.gz: 0ab1079409ef2541c80d838925b59b8f8f11d259e993248c326e9f451091906184299870bc14c9364be02b68175f8a85c4b66091a4d81d2bc4a692efbad57d9b
7
+ data.tar.gz: 1a5182f13b2f71fe6ecac5cfe07b6488b789da43c583b9e5245ab84a1e40dec67254a2721f46240faee4ef8addb70974e2c86e3537b99b6b24f8c48784cdf6b0
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --color
2
+ --require spec_helper
3
+ --format doc
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - "2.1.0"
4
+ - "2.2.0"
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in rpareia.gemspec
4
+ gemspec
@@ -0,0 +1,84 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ rpareia (0.0.1)
5
+ gli (~> 2.12)
6
+ nokogiri (~> 1.6.6)
7
+ rbczmq (~> 1.7)
8
+
9
+ GEM
10
+ remote: https://rubygems.org/
11
+ specs:
12
+ celluloid (0.16.0)
13
+ timers (~> 4.0.0)
14
+ coderay (1.1.0)
15
+ diff-lcs (1.2.5)
16
+ ffi (1.9.6)
17
+ formatador (0.2.5)
18
+ gli (2.12.2)
19
+ guard (2.11.1)
20
+ formatador (>= 0.2.4)
21
+ listen (~> 2.7)
22
+ lumberjack (~> 1.0)
23
+ nenv (~> 0.1)
24
+ notiffany (~> 0.0)
25
+ pry (>= 0.9.12)
26
+ shellany (~> 0.0)
27
+ thor (>= 0.18.1)
28
+ guard-compat (1.2.1)
29
+ guard-rspec (4.5.0)
30
+ guard (~> 2.1)
31
+ guard-compat (~> 1.1)
32
+ rspec (>= 2.99.0, < 4.0)
33
+ hitimes (1.2.2)
34
+ listen (2.8.5)
35
+ celluloid (>= 0.15.2)
36
+ rb-fsevent (>= 0.9.3)
37
+ rb-inotify (>= 0.9)
38
+ lumberjack (1.0.9)
39
+ method_source (0.8.2)
40
+ mini_portile (0.6.2)
41
+ nenv (0.2.0)
42
+ nokogiri (1.6.6.2)
43
+ mini_portile (~> 0.6.0)
44
+ notiffany (0.0.3)
45
+ nenv (~> 0.1)
46
+ shellany (~> 0.0)
47
+ pry (0.10.1)
48
+ coderay (~> 1.1.0)
49
+ method_source (~> 0.8.1)
50
+ slop (~> 3.4)
51
+ rake (10.4.2)
52
+ rb-fsevent (0.9.4)
53
+ rb-inotify (0.9.5)
54
+ ffi (>= 0.5.0)
55
+ rbczmq (1.7.8)
56
+ rspec (3.2.0)
57
+ rspec-core (~> 3.2.0)
58
+ rspec-expectations (~> 3.2.0)
59
+ rspec-mocks (~> 3.2.0)
60
+ rspec-core (3.2.0)
61
+ rspec-support (~> 3.2.0)
62
+ rspec-expectations (3.2.0)
63
+ diff-lcs (>= 1.2.0, < 2.0)
64
+ rspec-support (~> 3.2.0)
65
+ rspec-mocks (3.2.0)
66
+ diff-lcs (>= 1.2.0, < 2.0)
67
+ rspec-support (~> 3.2.0)
68
+ rspec-support (3.2.1)
69
+ shellany (0.0.1)
70
+ slop (3.6.0)
71
+ thor (0.19.1)
72
+ timers (4.0.1)
73
+ hitimes
74
+
75
+ PLATFORMS
76
+ ruby
77
+
78
+ DEPENDENCIES
79
+ bundler (~> 1.7)
80
+ guard-rspec (~> 4.5)
81
+ pry (~> 0.10)
82
+ rake (~> 10.0)
83
+ rpareia!
84
+ rspec (~> 3.2)
@@ -0,0 +1,5 @@
1
# Guard configuration: re-run specs through Bundler when files change.
guard(:rspec, cmd: "bundle exec rspec") do
  # A spec file that changes is re-run as is.
  watch(%r{^spec/(.+)/.+_spec\.rb$})

  # A library file that changes maps to the spec mirroring its path.
  watch(%r{^lib/(.+)/(.+)\.rb$}) do |match|
    "spec/#{match[1]}/#{match[2]}_spec.rb"
  end

  # A change to the shared helper maps to the whole spec directory.
  watch('spec/spec_helper.rb') { "spec" }
end
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2015 Michel Boaventura
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,24 @@
1
+ [![CodeClimate](https://codeclimate.com/github/michelboaventura/rpareia/badges/gpa.svg)](https://codeclimate.com/github/michelboaventura/rpareia)
2
+ [![Travis](https://travis-ci.org/michelboaventura/rpareia.svg)](https://travis-ci.org/michelboaventura/rpareia)
3
+
4
+ # Rpareia
5
+
6
+ A Ruby implementation of Pareia[1], built on ZeroMQ.
7
+
8
+ [1] https://github.com/michelboaventura/pareia
9
+
10
+ ## Installation
11
+
12
+ $ gem install rpareia
13
+
14
+ ## Usage
15
+
16
+ Check a project XML file for errors: `rpareia project --check project.xml`
17
+
18
+ ## Contributing
19
+
20
+ 1. Fork it ( https://github.com/michelboaventura/rpareia/fork )
21
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
22
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
23
+ 4. Push to the branch (`git push origin my-new-feature`)
24
+ 5. Create a new Pull Request
@@ -0,0 +1,12 @@
1
+ require "bundler/gem_tasks"
2
+
3
# Define the :spec task and make it the default task. RSpec is optional:
# when it cannot be loaded the LoadError is swallowed and only the tasks
# required above remain available.
begin
  require "rspec/core/rake_task"

  RSpec::Core::RakeTask.new(:spec)
  task default: :spec
rescue LoadError
  # no rspec available
end
@@ -0,0 +1,23 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'gli'
4
+ require 'rpareia'
5
+
6
include GLI::App

program_desc "A data deduplication software"

desc "Handles with project files"
command :project do |c|
  c.desc 'Check a project xml for errors'
  c.flag [:c, :check]

  # Constructing the parser is the check itself: Rpareia::Parser.new raises
  # as soon as it finds a problem, so reaching `puts` means the file is valid.
  c.action do |_global_options, options, _args|
    project = options[:c]
    if project
      # File.read opens, reads and closes the file in one call; the previous
      # File.open(...).read left the handle open until garbage collection.
      Rpareia::Parser.new(File.read(project))
      puts "Project file '#{project}' is OK"
    end
  end
end

exit run(ARGV)
@@ -0,0 +1,6 @@
1
+ require "rpareia/version"
2
+ require "rpareia/parser"
3
+
4
+ module Rpareia
5
+   # Top-level namespace of the gem; Rpareia::VERSION and Rpareia::Parser are loaded by the requires above.
6
+ end
@@ -0,0 +1,52 @@
1
+ require_relative 'heartbeat'
2
+ require 'rbczmq'
3
+ require 'pry'
4
+
5
# Receiving end of the pipeline. A Blocker binds a ROUTER socket on
# BLOCKER_PORT_START + rank, answers heartbeats on twice that port from a
# background thread, and counts the messages that arrive.
class Blocker

  BLOCKER_PORT_START = 2000
  CTX = ZMQ::Context.new

  # rank:  offset added to BLOCKER_PORT_START to obtain the listening port.
  # debug: value assigned to the sockets' +verbose+ attribute.
  def initialize(rank:, debug: false)
    @rank  = rank
    @debug = debug
    @port  = BLOCKER_PORT_START + rank
  end

  # Starts the heartbeat thread, binds the socket and then receives forever.
  # Every message other than "EOF" increments the counter; "EOF" prints the
  # running total. The loop has no exit condition.
  def start
    start_hb
    connect

    total = 0

    loop do
      msg = @socket.recv

      # Nothing received, or a frame starting with a NUL byte
      # (presumably the ROUTER identity frame -- TODO confirm).
      next if msg.nil?
      next if msg[0] == "\x00"

      if msg == "EOF"
        puts "TOTAL=#{total}"
      else
        total += 1
      end
    end
  end

  # Spawns a thread that serves one heartbeat after another on port 2 * @port.
  def start_hb
    heartbeat_addr = "tcp://*:#{2 * @port}"

    Thread.new do
      loop { Heartbeat.check(addr: heartbeat_addr, ctx: CTX, type: 'server', debug: @debug) }
    end
  end

  # Creates the ROUTER socket and binds it to this blocker's port.
  def connect
    @socket = CTX.socket(:ROUTER)
    @socket.verbose = @debug
    @socket.bind("tcp://*:#{@port}")
  end
end

# Running this file starts a blocker whose rank is the first CLI argument.
Blocker.new(rank: ARGV[0].to_i).start
@@ -0,0 +1,17 @@
1
# One-shot liveness handshake over a short-lived socket.
class Heartbeat
  # Performs a single heartbeat exchange and closes the socket afterwards.
  #
  # addr:  endpoint to connect to (client) or bind on (server).
  # ctx:   object whose #socket(kind) returns the socket to use.
  # type:  'client' connects a :REQ socket and sends an empty message;
  #        any other value binds a :REP socket and waits for one message.
  # debug: assigned to the socket's +verbose+ attribute.
  #
  # The socket is closed even when connecting, binding, sending or receiving
  # raises, so a failed heartbeat no longer leaks it. The return value is not
  # meaningful; the callers in this project ignore it.
  def self.check(addr:, ctx:, type:, debug: false)
    client = (type == 'client')
    hb = ctx.socket(client ? :REQ : :REP)

    begin
      hb.verbose = debug

      if client
        hb.connect(addr)
        hb.send("")
      else
        hb.bind(addr)
        hb.recv
      end
    ensure
      hb.close
    end
  end
end
@@ -0,0 +1,171 @@
1
+ require "bundler/setup"
2
+ require "nokogiri"
3
+ require "pry"
4
+
5
module Rpareia
  # Validates a project description written in XML.
  #
  # Parser.new(xml_string) runs every check in sequence and raises one of the
  # nested error classes on the first problem it finds. When construction
  # succeeds, #project holds a Hash with the keys :name, :data_sources,
  # :parts and :output.
  class Parser
    attr_reader :project

    class SyntaxError < StandardError; end
    class InvalidTaskError < StandardError; end
    class InvalidNumberOfSources < StandardError; end
    class DuplicatedDataSourceId < StandardError; end
    class MissingDataSourceId < StandardError; end
    class MissingDataSourceFile < StandardError; end
    class InvalidDataSourceFile < StandardError; end
    class InvalidDataSourceType < StandardError; end
    class MissingFieldSeparator < StandardError; end
    class FieldsElementNotFound < StandardError; end
    class MissingFieldName < StandardError; end
    class InvalidFieldType < StandardError; end
    class DuplicatedFieldName < StandardError; end
    class DeterministicLinkageElementNotFound < StandardError; end
    class ConjunctionElementNotFound < StandardError; end
    class MultipleConjunctionElements < StandardError; end
    class MissingPart < StandardError; end
    class MissingPartFieldNameOnDataSource < StandardError; end
    class MissingOutputElement < StandardError; end
    class MultipleOutputElement < StandardError; end
    class MissingDeterministicAttribute < StandardError; end

    # xml: the project file contents as a String.
    # Raises one of the error classes above when the document is invalid.
    def initialize(xml)
      @xml = xml

      parse
    end

    private

    # Runs every validation step; later steps rely on what earlier ones stored.
    def parse
      parse_xml
      parse_task
      parse_data_sources
      parse_deterministic_linkage
      parse_output
    end

    # Replaces the raw string in @xml with the parsed Nokogiri document.
    def parse_xml
      @xml = Nokogiri::XML(@xml)

      raise SyntaxError, @xml.errors.join("\n") unless @xml.errors.empty?
    end

    # Stores the task name. A missing `task` attribute is stored as nil so
    # that it is later reported as InvalidTaskError instead of failing here
    # with NoMethodError.
    def parse_task
      task = @xml.xpath("/project/@task").first
      @project = {name: task && task.value}
    end

    # Returns the first element of arr that occurs more than once, or nil.
    def find_duplicated(arr)
      arr.find { |e| arr.rindex(e) != arr.index(e) }
    end

    # Validates every <data-source>, then the uniqueness of their ids and the
    # number of sources the task requires.
    def parse_data_sources
      @project[:data_sources] = []

      @xml.xpath("/project/data-sources/data-source").each do |data_source|
        @project[:data_sources] << parse_data_source(data_source)
      end

      if dup = find_duplicated(@project[:data_sources].map { |el| el[:id] })
        raise DuplicatedDataSourceId, "Duplicated data source id '#{dup}'"
      end

      check_number_of_sources
    end

    # Validates one <data-source> element and returns {id:, file:, fields:}.
    def parse_data_source(data_source)
      id = data_source['id'].to_s
      raise MissingDataSourceId if id.empty?

      file = data_source['file'].to_s
      raise MissingDataSourceFile, "Missing file attribute from data source '#{id}'" if file.empty?
      raise InvalidDataSourceFile, "File '#{file}' from data source '#{id}' does not exist" unless File.exist?(file)

      type = data_source['type'].to_s
      raise InvalidDataSourceType, "Data source type '#{type}' not supported" if type != "delimited"

      field_separator = data_source['field-separator'].to_s
      raise MissingFieldSeparator, "Missing field separator from data source '#{id}'" if field_separator.empty?

      {id: id, file: file, fields: parse_fields(data_source, id)}
    end

    # Validates the <field> children of a data source and returns them as an
    # Array of {name:, type:} hashes.
    def parse_fields(data_source, id)
      fields = data_source.xpath("fields/field")
      raise FieldsElementNotFound, "Element 'field' not found on data source '#{id}'" if fields.empty?

      my_fields = []
      fields.each do |field|
        # to_s: a missing attribute is nil and must be reported as
        # MissingFieldName, exactly like an empty one.
        name = field['name'].to_s
        raise MissingFieldName, "Attribute 'name' not found on data source '#{id}'" if name.empty?

        field_type = field['type']
        unless ['int', 'string'].include?(field_type)
          raise InvalidFieldType, "Invalid type '#{field_type}' from field '#{name}', data source '#{id}'"
        end

        my_fields << {name: name, type: field_type}
      end

      if dup = find_duplicated(my_fields.map { |f| f[:name] })
        raise DuplicatedFieldName, "Duplicated field name '#{dup}' on data source '#{id}'"
      end

      my_fields
    end

    # 'linkage' needs exactly two data sources, 'deduplication' exactly one;
    # any other task name is rejected.
    def check_number_of_sources
      count = @project[:data_sources].size

      case @project[:name]
      when 'linkage'
        raise InvalidNumberOfSources, "Linkage: expected two data-source, #{count} given" if count != 2
      when 'deduplication'
        raise InvalidNumberOfSources, "Deduplication: expected one data-sources, #{count} given" if count != 1
      else
        raise InvalidTaskError, "Invalid task: '#{@project[:name]}'"
      end
    end

    # Requires exactly one <conjunction> with at least one <part>, and checks
    # that every part names a field present on every data source.
    def parse_deterministic_linkage
      deterministic_linkage = @xml.xpath("/project/deterministic-linkage")
      if deterministic_linkage.empty?
        raise DeterministicLinkageElementNotFound, "Missing deterministic-linkage element"
      end

      conjunction = deterministic_linkage.xpath("conjunction")
      size = conjunction.size
      raise ConjunctionElementNotFound, "Missing conjunction element" if size.zero?
      raise MultipleConjunctionElements, "Only one conjunction element is allowed, #{size} found" if size > 1

      parts = conjunction.xpath("part")
      raise MissingPart, "At leas one part element is required" if parts.empty?

      @project[:parts] = []
      parts.each do |part|
        field_name = part['field-name'].to_s
        raise MissingFieldName, "Missing attribute field-name on part element" if field_name.empty?

        @project[:parts] << {field_name: field_name}
      end

      @project[:parts].each do |part|
        @project[:data_sources].each do |data_source|
          known_names = data_source[:fields].map { |e| e[:name] }
          next if known_names.include?(part[:field_name])

          raise MissingPartFieldNameOnDataSource,
                "Field name '#{part[:field_name]}' not found on data source '#{data_source[:id]}'"
        end
      end
    end

    # Requires exactly one <output> element carrying a `deterministic`
    # attribute, whose value is stored under :output.
    def parse_output
      output = @xml.xpath("/project/output")

      size = output.size
      raise MissingOutputElement, "Missing output element" if size.zero?
      raise MultipleOutputElement, "Only one output element is allowed, #{size} found" if size > 1

      @project[:output] = output.first['deterministic']
      unless @project[:output]
        raise MissingDeterministicAttribute, "Missing attribute 'deterministic' on output element"
      end
    end
  end
end
@@ -0,0 +1,91 @@
1
+ require 'bundler/setup'
2
+ require 'rbczmq'
3
+ require_relative 'heartbeat'
4
+ require 'pry'
5
+
6
# Sending end of the pipeline. A Reader opens one DEALER socket per blocker,
# reads its share of the lines of a comma-separated file and sends each
# record to the blocker selected by the record's key.
class Reader
  BLOCKER_PORT_START = 2000
  CTX = ZMQ::Context.new

  # num_readers: how many readers share the file; with +rank+ it decides
  #              which lines this reader handles.
  # num_blocks:  number of blockers; ports BLOCKER_PORT_START...+num_blocks.
  # rank:        this reader's index.
  # file:        path of the input file.
  # debug:       assigned to the sockets' +verbose+ attribute.
  # addr:        protocol and host the blockers are reached at.
  def initialize(num_readers:, num_blocks:, rank:, file:, debug: false, addr: "tcp://localhost")
    @num_readers = num_readers
    @num_blocks  = num_blocks
    @rank        = rank
    @file        = file
    @debug       = debug
    @addr        = addr
  end

  def start
    start_connections
    read_database
  end

  # Performs a heartbeat against every blocker (on twice its port) and then
  # connects a socket to it.
  def start_connections
    @connections = []

    0.upto(@num_blocks - 1) do |i|
      port = BLOCKER_PORT_START + i
      Heartbeat.check(addr: "#{@addr}:#{2 * port}", ctx: CTX, type: 'client', debug: @debug)

      @connections << create_socket(port)
    end
  end

  def create_socket(port)
    socket = CTX.socket(:DEALER)
    socket.verbose = @debug
    socket.connect("#{@addr}:#{port}")
    socket
  end

  # Sends every line whose index modulo @num_readers equals @rank, then
  # closes the connections and prints "EOF".
  #
  # The block form of File.open closes the file when reading ends or fails;
  # previously the handle was never closed. Blank lines still count towards
  # the line index but are skipped instead of crashing in create_pair.
  def read_database
    File.open(@file, 'r') do |database|
      database.each_line.with_index do |line, i|
        next if i % @num_readers != @rank

        record = line.strip
        next if record.empty?

        send_pair(create_pair(record))
      end
    end
    close_connections
    puts "EOF"
  end

  # Sends "EOF" on every connection, closes it, then destroys the context.
  def close_connections
    @connections.each do |c|
      c.send("EOF")
      c.close
    end
    CTX.destroy
  end

  def send_pair(pair)
    block_id = which_block(pair.last)

    @connections[block_id].send(pair.join(','))
  end

  def create_hash(input)
    input.join
  end

  # Maps a key to a blocker index using its first byte. An empty key has no
  # first byte; to_i turns that nil into 0 instead of raising.
  def which_block(key)
    key.bytes.first.to_i % @num_blocks
  end

  # Splits a record into [first column, remaining columns joined together].
  # drop(1) yields [] when there are no remaining columns, where the former
  # line[1..-1] could yield nil.
  def create_pair(input)
    line = input.split(',')
    [line.first, create_hash(line.drop(1))]
  end
end

# Running this file starts a reader: rank, input file and blocker address
# come from the command line, the rest is fixed.
Reader.new(
  debug: false,
  num_readers: 2,
  num_blocks: 1,
  rank: ARGV[0].to_i,
  file: (ARGV[1] || "input/10m.csv"),
  addr: (ARGV[2] || "tcp://localhost")
).start
@@ -0,0 +1,3 @@
1
module Rpareia
  # Release number of the gem.
  VERSION = "0.0.1"
end
@@ -0,0 +1,28 @@
1
+ require File.join([File.dirname(__FILE__),'lib','rpareia','version.rb'])
2
+
3
# Gem specification for rpareia. The file list comes from `git ls-files`,
# so the gem has to be built from inside a git checkout.
Gem::Specification.new do |spec|
  spec.name        = "rpareia"
  spec.description = "A data deduplication software, based on Pareia"
  spec.version     = Rpareia::VERSION
  spec.authors     = ["Michel Boaventura"]
  spec.email       = ["michel.boaventura@gmail.com"]
  spec.homepage    = "http://github.com/michelboaventura/rpareia"
  spec.platform    = Gem::Platform::RUBY
  spec.summary     = %q{Pareia implementation, with ruby and ZeroMQ}
  spec.license     = "MIT"

  spec.files         = `git ls-files -z`.split("\x0")
  spec.require_paths = ["lib"]
  spec.bindir        = 'bin'
  spec.executables   = ['rpareia']
  spec.test_files    = spec.files.grep(%r{^(spec)/})

  # Needed only to develop and test the gem.
  [
    ["bundler",     "~> 1.7"],
    ["rake",        "~> 10.0"],
    ["pry",         "~> 0.10"],
    ["rspec",       "~> 3.2"],
    ["guard-rspec", "~> 4.5"]
  ].each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end

  # Needed at runtime.
  [
    ["nokogiri", "~> 1.6"],
    ["rbczmq",   "~> 1.7"],
    ["gli",      "~> 2.12"]
  ].each do |gem_name, requirement|
    spec.add_dependency gem_name, requirement
  end
end
File without changes
@@ -0,0 +1,215 @@
1
+ include Rpareia
2
+
3
+ DEFAULT_FILE = "spec/fixtures/empty_file.xml"
4
+
5
# Builds a project XML string through #xml. The defaults describe a
# deduplication project with one data source; override a keyword to alter
# just that aspect.
def xml_dedup(task: 'deduplication',
              ids: [0],
              files: [DEFAULT_FILE],
              types: ['delimited'],
              field_separators: ["\t"],
              fields: [[{name: 'id', type: 'int'}]],
              parts: [{:'field-name' => 'id'}],
              deterministic: true,
              conjunction: 1,
              output_deterministic: ['foo'])
  xml(
    task: task,
    ids: ids,
    files: files,
    types: types,
    field_separators: field_separators,
    fields: fields,
    parts: parts,
    deterministic: deterministic,
    conjunction: conjunction,
    output_deterministic: output_deterministic
  )
end
8
+
9
# Builds a project XML string through #xml. The defaults describe a linkage
# project with two data sources; override a keyword to alter just that
# aspect.
def xml_linkage(task: 'linkage',
                ids: [0, 1],
                files: [DEFAULT_FILE, DEFAULT_FILE],
                types: ['delimited', 'delimited'],
                field_separators: ["\t", "\t"],
                fields: [[{name: 'id', type: 'int'}], [{name: 'id', type: 'int'}]],
                parts: [{:'field-name' => 'id'}],
                deterministic: true,
                conjunction: 1,
                output_deterministic: ['foo'])
  xml(
    task: task,
    ids: ids,
    files: files,
    types: types,
    field_separators: field_separators,
    fields: fields,
    parts: parts,
    deterministic: deterministic,
    conjunction: conjunction,
    output_deterministic: output_deterministic
  )
end
13
+
14
# Renders a project document with Nokogiri's builder and returns it as a
# String.
#
# ids, files, types, field_separators and fields are parallel arrays, one
# entry per <data-source>. +deterministic+ toggles the <deterministic-linkage>
# element, +conjunction+ is how many <conjunction> children it gets, and
# +output_deterministic+ produces one <output> per entry -- a bare <output>
# when it is empty, none at all when it is falsy.
def xml(task:, ids: [], files: nil, types: nil, field_separators: nil, fields: nil, parts: nil, deterministic: true, conjunction: 1, output_deterministic: ['foo'])
  builder = Nokogiri::XML::Builder.new do |doc|
    doc.project(task: task) do
      doc.send('data-sources') do
        ids.each.with_index do |id, index|
          doc.send('data-source', id: id, file: files[index], type: types[index], :'field-separator' => field_separators[index]) do
            doc.fields do
              (fields[index] || []).each { |field| doc.field(field) }
            end
          end
        end
      end

      if deterministic
        doc.send('deterministic-linkage') do
          conjunction.times do
            doc.conjunction do
              parts.each { |part| doc.part(part) }
            end
          end
        end
      end

      if output_deterministic
        if output_deterministic.empty?
          doc.output
        else
          output_deterministic.each { |out| doc.output(deterministic: out) }
        end
      end
    end
  end

  builder.to_xml
end
52
+
53
# Specification of Rpareia::Parser. Every example builds a project document
# with xml_dedup / xml_linkage, changing one aspect, and checks which error
# (if any) constructing the parser raises.
RSpec.describe Parser do
  context "initialization" do
    it "expects a xml string as argument" do
      expect { Parser.new }.to raise_error(ArgumentError)
    end

    it "complains about invalid XML" do
      # The closing quote of the version attribute is missing on purpose.
      bad_xml = '<?xml version="1.0 encoding="UTF-8"?>'
      expect { Parser.new(bad_xml) }.to raise_error(Parser::SyntaxError)
    end
  end

  context "tasks" do
    it "accepts only valid tasks" do
      doc = xml_dedup
      expect { Parser.new(doc) }.to_not raise_error

      doc = xml_linkage
      expect { Parser.new(doc) }.to_not raise_error
    end

    it "rejects invalid tasks" do
      doc = xml_dedup(task: 'foo')
      expect { Parser.new(doc) }.to raise_error(Parser::InvalidTaskError, "Invalid task: 'foo'")
    end
  end

  context "data source" do
    it "accepts only one data sources when deduplicating" do
      doc = xml_linkage(task: 'deduplication')
      expect { Parser.new(doc) }.to raise_error(Parser::InvalidNumberOfSources, "Deduplication: expected one data-sources, 2 given")
    end

    it "accepts only two data sources when linking" do
      doc = xml_dedup(task: 'linkage')
      expect { Parser.new(doc) }.to raise_error(Parser::InvalidNumberOfSources, "Linkage: expected two data-source, 1 given")
    end

    it "rejects data sources without id" do
      doc = xml_dedup(ids: [nil])
      expect { Parser.new(doc) }.to raise_error(Parser::MissingDataSourceId)
    end

    it "rejects two data sources with same id" do
      doc = xml_linkage(ids: [1, 1])
      expect { Parser.new(doc) }.to raise_error(Parser::DuplicatedDataSourceId, "Duplicated data source id '1'")
    end

    it "expects a file name" do
      doc = xml_dedup(files: [''])
      expect { Parser.new(doc) }.to raise_error(Parser::MissingDataSourceFile, "Missing file attribute from data source '0'")
    end

    it "rejects invalid file names" do
      doc = xml_dedup(files: ['invalid'])
      expect { Parser.new(doc) }.to raise_error(Parser::InvalidDataSourceFile, "File 'invalid' from data source '0' does not exist")
    end

    it "accepts only 'delimited' type" do
      doc = xml_dedup(types: ['delimited'])
      expect { Parser.new(doc) }.to_not raise_error

      doc = xml_dedup(types: ['foo'])
      expect { Parser.new(doc) }.to raise_error(Parser::InvalidDataSourceType, "Data source type 'foo' not supported")
    end

    it "expects a field-separator" do
      doc = xml_dedup(field_separators: ["\t"])
      expect { Parser.new(doc) }.to_not raise_error

      doc = xml_dedup(field_separators: [""])
      expect { Parser.new(doc) }.to raise_error(Parser::MissingFieldSeparator, "Missing field separator from data source '0'")
    end

    it "expects a 'fields' entry" do
      doc = xml_dedup(fields: [[{name: 'id', type: 'int'}]], parts: [{:'field-name' => 'id'}])
      expect { Parser.new(doc) }.to_not raise_error

      doc = xml_dedup(fields: [])
      expect { Parser.new(doc) }.to raise_error(Parser::FieldsElementNotFound)
    end

    it "expect a name attribute on field element" do
      doc = xml_dedup(fields: [[{name: '', type: 'int'}]])
      expect { Parser.new(doc) }.to raise_error(Parser::MissingFieldName)
    end

    it "rejects duplicated names on the same data source" do
      doc = xml_dedup(fields: [[{name: 'foo', type: 'int'}, {name: 'foo', type: 'int'}]])
      expect { Parser.new(doc) }.to raise_error(Parser::DuplicatedFieldName, "Duplicated field name 'foo' on data source '0'")
    end

    it "expect a valid field type" do
      ['int', 'string'].each do |type|
        doc = xml_dedup(fields: [[{name: 'foo', type: type}]], parts: [{:'field-name' => 'foo'}])
        expect { Parser.new(doc) }.to_not raise_error
      end

      doc = xml_dedup(fields: [[{name: 'foo', type: 'bar'}]])
      expect { Parser.new(doc) }.to raise_error(Parser::InvalidFieldType, "Invalid type 'bar' from field 'foo', data source '0'")
    end
  end

  context "deterministic" do
    it "expects a deterministic-linkage element" do
      doc = xml_dedup(deterministic: false)
      expect { Parser.new(doc) }.to raise_error(Parser::DeterministicLinkageElementNotFound, "Missing deterministic-linkage element")
    end

    it "expects a single conjunction element" do
      doc = xml_dedup(conjunction: 0)
      expect { Parser.new(doc) }.to raise_error(Parser::ConjunctionElementNotFound, "Missing conjunction element")

      doc = xml_dedup(conjunction: 1)
      expect { Parser.new(doc) }.to_not raise_error

      conjunction = 2
      doc = xml_dedup(conjunction: conjunction)
      expect { Parser.new(doc) }.to raise_error(Parser::MultipleConjunctionElements, "Only one conjunction element is allowed, #{conjunction} found")
    end

    it "expects at least one part element" do
      doc = xml_dedup(fields: [[{name: 'foo', type: 'int'}]], parts: [{:'field-name' => 'foo'}])
      expect { Parser.new(doc) }.to_not raise_error

      # The expected message mirrors the parser's text verbatim, typo included.
      doc = xml_dedup(parts: [])
      expect { Parser.new(doc) }.to raise_error(Parser::MissingPart, "At leas one part element is required")
    end

    it "expects a field-name on part elements" do
      doc = xml_dedup(fields: [[{name: 'foo', type: 'string'}]], parts: [{:'field-name' => 'foo'}])
      expect { Parser.new(doc) }.to_not raise_error

      doc = xml_dedup(parts: [{not_field_name: 'foo'}])
      expect { Parser.new(doc) }.to raise_error(Parser::MissingFieldName, "Missing attribute field-name on part element")
    end

    it "accepts only field-names present on all data sources" do
      doc = xml_linkage(fields: [[{name: 'foo', type: 'string'}], [{name: 'foo', type: 'string'}]], parts: [{:'field-name' => 'foo'}])
      expect { Parser.new(doc) }.to_not raise_error

      doc = xml_linkage(fields: [[{name: 'foo', type: 'string'}], [{name: 'bar', type: 'string'}]], parts: [{:'field-name' => 'foo'}])
      expect { Parser.new(doc) }.to raise_error(Parser::MissingPartFieldNameOnDataSource, "Field name 'foo' not found on data source '1'")
    end
  end

  context "output" do
    it "expects a single 'output' element" do
      doc = xml_dedup(output_deterministic: ["foo"])
      expect { Parser.new(doc) }.to_not raise_error

      doc = xml_dedup(output_deterministic: ["foo", "bar"])
      expect { Parser.new(doc) }.to raise_error(Parser::MultipleOutputElement, "Only one output element is allowed, 2 found")

      doc = xml_dedup(output_deterministic: false)
      expect { Parser.new(doc) }.to raise_error(Parser::MissingOutputElement, "Missing output element")
    end

    it "expects a deterministic attribute on output element" do
      # An empty value makes the helper emit <output> without the attribute.
      doc = xml_dedup(output_deterministic: "")
      expect { Parser.new(doc) }.to raise_error(Parser::MissingDeterministicAttribute, "Missing attribute 'deterministic' on output element")
    end
  end
end
@@ -0,0 +1,21 @@
1
+ require 'rpareia'
2
+
3
# Suite-wide RSpec settings.
RSpec.configure do |config|
  config.expect_with(:rspec) do |expectations|
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  config.disable_monkey_patching!

  #config.warnings = true

  # Use the 'doc' formatter by default when a single spec file is run.
  config.default_formatter = 'doc' if config.files_to_run.one?

  #config.profile_examples = 10

  config.order = :random

  # Seed Kernel's random number generator with the suite's seed.
  Kernel.srand config.seed
end
metadata ADDED
@@ -0,0 +1,179 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rpareia
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Michel Boaventura
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-02-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.7'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.7'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pry
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.10'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.10'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3.2'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3.2'
69
+ - !ruby/object:Gem::Dependency
70
+ name: guard-rspec
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '4.5'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '4.5'
83
+ - !ruby/object:Gem::Dependency
84
+ name: nokogiri
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '1.6'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '1.6'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rbczmq
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '1.7'
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '1.7'
111
+ - !ruby/object:Gem::Dependency
112
+ name: gli
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '2.12'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '2.12'
125
+ description: A data deduplication software, based on Pareia
126
+ email:
127
+ - michel.boaventura@gmail.com
128
+ executables:
129
+ - rpareia
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - ".rspec"
134
+ - ".travis.yml"
135
+ - Gemfile
136
+ - Gemfile.lock
137
+ - Guardfile
138
+ - LICENSE.txt
139
+ - README.md
140
+ - Rakefile
141
+ - bin/rpareia
142
+ - lib/rpareia.rb
143
+ - lib/rpareia/blocker.rb
144
+ - lib/rpareia/heartbeat.rb
145
+ - lib/rpareia/parser.rb
146
+ - lib/rpareia/reader.rb
147
+ - lib/rpareia/version.rb
148
+ - rpareia.gemspec
149
+ - spec/fixtures/empty_file.xml
150
+ - spec/rpareia/parser_spec.rb
151
+ - spec/spec_helper.rb
152
+ homepage: http://github.com/michelboaventura/rpareia
153
+ licenses:
154
+ - MIT
155
+ metadata: {}
156
+ post_install_message:
157
+ rdoc_options: []
158
+ require_paths:
159
+ - lib
160
+ required_ruby_version: !ruby/object:Gem::Requirement
161
+ requirements:
162
+ - - ">="
163
+ - !ruby/object:Gem::Version
164
+ version: '0'
165
+ required_rubygems_version: !ruby/object:Gem::Requirement
166
+ requirements:
167
+ - - ">="
168
+ - !ruby/object:Gem::Version
169
+ version: '0'
170
+ requirements: []
171
+ rubyforge_project:
172
+ rubygems_version: 2.4.5
173
+ signing_key:
174
+ specification_version: 4
175
+ summary: Pareia implementation, with ruby and ZeroMQ
176
+ test_files:
177
+ - spec/fixtures/empty_file.xml
178
+ - spec/rpareia/parser_spec.rb
179
+ - spec/spec_helper.rb