rpareia 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +84 -0
- data/Guardfile +5 -0
- data/LICENSE.txt +22 -0
- data/README.md +24 -0
- data/Rakefile +12 -0
- data/bin/rpareia +23 -0
- data/lib/rpareia.rb +6 -0
- data/lib/rpareia/blocker.rb +52 -0
- data/lib/rpareia/heartbeat.rb +17 -0
- data/lib/rpareia/parser.rb +171 -0
- data/lib/rpareia/reader.rb +91 -0
- data/lib/rpareia/version.rb +3 -0
- data/rpareia.gemspec +28 -0
- data/spec/fixtures/empty_file.xml +0 -0
- data/spec/rpareia/parser_spec.rb +215 -0
- data/spec/spec_helper.rb +21 -0
- metadata +179 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8fb7c8a13b2133209c080eaf3eba626412e71659
|
4
|
+
data.tar.gz: e3acad44a34d0ba4d74b746aa82fb09cce9ddf64
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0ab1079409ef2541c80d838925b59b8f8f11d259e993248c326e9f451091906184299870bc14c9364be02b68175f8a85c4b66091a4d81d2bc4a692efbad57d9b
|
7
|
+
data.tar.gz: 1a5182f13b2f71fe6ecac5cfe07b6488b789da43c583b9e5245ab84a1e40dec67254a2721f46240faee4ef8addb70974e2c86e3537b99b6b24f8c48784cdf6b0
|
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,84 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
rpareia (0.0.1)
|
5
|
+
gli (~> 2.12)
|
6
|
+
nokogiri (~> 1.6.6)
|
7
|
+
rbczmq (~> 1.7)
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
celluloid (0.16.0)
|
13
|
+
timers (~> 4.0.0)
|
14
|
+
coderay (1.1.0)
|
15
|
+
diff-lcs (1.2.5)
|
16
|
+
ffi (1.9.6)
|
17
|
+
formatador (0.2.5)
|
18
|
+
gli (2.12.2)
|
19
|
+
guard (2.11.1)
|
20
|
+
formatador (>= 0.2.4)
|
21
|
+
listen (~> 2.7)
|
22
|
+
lumberjack (~> 1.0)
|
23
|
+
nenv (~> 0.1)
|
24
|
+
notiffany (~> 0.0)
|
25
|
+
pry (>= 0.9.12)
|
26
|
+
shellany (~> 0.0)
|
27
|
+
thor (>= 0.18.1)
|
28
|
+
guard-compat (1.2.1)
|
29
|
+
guard-rspec (4.5.0)
|
30
|
+
guard (~> 2.1)
|
31
|
+
guard-compat (~> 1.1)
|
32
|
+
rspec (>= 2.99.0, < 4.0)
|
33
|
+
hitimes (1.2.2)
|
34
|
+
listen (2.8.5)
|
35
|
+
celluloid (>= 0.15.2)
|
36
|
+
rb-fsevent (>= 0.9.3)
|
37
|
+
rb-inotify (>= 0.9)
|
38
|
+
lumberjack (1.0.9)
|
39
|
+
method_source (0.8.2)
|
40
|
+
mini_portile (0.6.2)
|
41
|
+
nenv (0.2.0)
|
42
|
+
nokogiri (1.6.6.2)
|
43
|
+
mini_portile (~> 0.6.0)
|
44
|
+
notiffany (0.0.3)
|
45
|
+
nenv (~> 0.1)
|
46
|
+
shellany (~> 0.0)
|
47
|
+
pry (0.10.1)
|
48
|
+
coderay (~> 1.1.0)
|
49
|
+
method_source (~> 0.8.1)
|
50
|
+
slop (~> 3.4)
|
51
|
+
rake (10.4.2)
|
52
|
+
rb-fsevent (0.9.4)
|
53
|
+
rb-inotify (0.9.5)
|
54
|
+
ffi (>= 0.5.0)
|
55
|
+
rbczmq (1.7.8)
|
56
|
+
rspec (3.2.0)
|
57
|
+
rspec-core (~> 3.2.0)
|
58
|
+
rspec-expectations (~> 3.2.0)
|
59
|
+
rspec-mocks (~> 3.2.0)
|
60
|
+
rspec-core (3.2.0)
|
61
|
+
rspec-support (~> 3.2.0)
|
62
|
+
rspec-expectations (3.2.0)
|
63
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
64
|
+
rspec-support (~> 3.2.0)
|
65
|
+
rspec-mocks (3.2.0)
|
66
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
67
|
+
rspec-support (~> 3.2.0)
|
68
|
+
rspec-support (3.2.1)
|
69
|
+
shellany (0.0.1)
|
70
|
+
slop (3.6.0)
|
71
|
+
thor (0.19.1)
|
72
|
+
timers (4.0.1)
|
73
|
+
hitimes
|
74
|
+
|
75
|
+
PLATFORMS
|
76
|
+
ruby
|
77
|
+
|
78
|
+
DEPENDENCIES
|
79
|
+
bundler (~> 1.7)
|
80
|
+
guard-rspec (~> 4.5)
|
81
|
+
pry (~> 0.10)
|
82
|
+
rake (~> 10.0)
|
83
|
+
rpareia!
|
84
|
+
rspec (~> 3.2)
|
data/Guardfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2015 Michel Boaventura
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
[Code Climate](https://codeclimate.com/github/michelboaventura/rpareia)
|
2
|
+
[![Build Status](https://travis-ci.org/michelboaventura/rpareia.svg)](https://travis-ci.org/michelboaventura/rpareia)
|
3
|
+
|
4
|
+
# Rpareia
|
5
|
+
|
6
|
+
A Ruby implementation of Pareia [1], built on ZeroMQ.
|
7
|
+
|
8
|
+
[1] https://github.com/michelboaventura/pareia
|
9
|
+
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
$ gem install rpareia
|
13
|
+
|
14
|
+
## Usage
|
15
|
+
|
16
|
+
Check a project XML file for errors with `rpareia project --check path/to/project.xml`. (TODO: document the remaining usage.)
|
17
|
+
|
18
|
+
## Contributing
|
19
|
+
|
20
|
+
1. Fork it ( https://github.com/michelboaventura/rpareia/fork )
|
21
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
22
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
23
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
24
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/bin/rpareia
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'gli'
|
4
|
+
require 'rpareia'
|
5
|
+
|
6
|
+
include GLI::App
|
7
|
+
|
8
|
+
program_desc "A data deduplication software"
|
9
|
+
|
10
|
+
desc "Handles with project files"
command :project do |c|
  c.desc 'Check a project xml for errors'
  c.flag [:c,:check]

  # Validates the project file given through -c/--check by handing its
  # contents to Rpareia::Parser, whose constructor raises on the first problem
  # it finds. Nothing happens when the flag is not given.
  c.action do |_global_options, options, _args|
    if project = options[:c]
      # File.read opens, reads and closes the file in one call; the previous
      # File.open(...).read left the handle open until garbage collection.
      Rpareia::Parser.new(File.read(project))
      puts "Project file '#{project}' is OK"
    end
  end
end
|
22
|
+
|
23
|
+
exit run(ARGV)
|
data/lib/rpareia.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
require_relative 'heartbeat'
|
2
|
+
require 'rbczmq'
|
3
|
+
require 'pry'
|
4
|
+
|
5
|
+
# Receives messages on a ZeroMQ ROUTER socket and counts them.
#
# Each blocker binds to BLOCKER_PORT_START + rank for data and answers
# heartbeats on twice that port number from a background thread.
class Blocker
  BLOCKER_PORT_START = 2000
  CTX = ZMQ::Context.new

  # rank  - Integer offset added to BLOCKER_PORT_START to pick the data port.
  # debug - forwarded to the sockets' `verbose=` setting.
  def initialize(rank:, debug: false)
    @rank  = rank
    @debug = debug
    @port  = BLOCKER_PORT_START + @rank
  end

  # Starts the heartbeat thread, binds the data socket and then loops forever,
  # counting every message that is not "EOF" and printing the running total
  # each time an "EOF" message arrives.
  def start
    start_hb
    connect

    received = 0

    loop do
      frame = @socket.recv

      # Skip empty reads and frames whose first character is a NUL byte
      # (presumably the ROUTER identity frame -- confirm against rbczmq).
      next if frame.nil? || frame[0] == "\x00"

      if frame == "EOF"
        puts "TOTAL=#{received}"
      else
        received += 1
      end
    end
  end

  # Spawns a thread that answers heartbeat checks, one after another, on port
  # 2 * @port. Returns the Thread.
  def start_hb
    Thread.new do
      loop do
        Heartbeat.check(addr: "tcp://*:#{2 * @port}", ctx: CTX, type: 'server', debug: @debug)
      end
    end
  end

  # Creates the ROUTER socket and binds it to the data port on all interfaces.
  def connect
    @socket = CTX.socket(:ROUTER)
    @socket.verbose = @debug
    @socket.bind("tcp://*:#{@port}")
  end
end
|
51
|
+
|
52
|
+
# Script entry point: the first command-line argument is the blocker rank
# (a missing argument becomes 0 through nil.to_i).
blocker_rank = ARGV[0].to_i
Blocker.new(rank: blocker_rank).start
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# One-shot liveness handshake between a client and a server over ZeroMQ.
class Heartbeat
  # Performs a single heartbeat exchange and always releases the socket.
  #
  # addr  - endpoint String handed to `connect` (client) or `bind` (server).
  # ctx   - object responding to `socket(kind)`; the callers in this project
  #         pass a ZMQ::Context.
  # type  - 'client' opens a :REQ socket, connects and sends an empty message;
  #         any other value opens a :REP socket, binds and waits on `recv`.
  # debug - forwarded to the socket's `verbose=` setting.
  #
  # Returns nil (the callers in this project ignore the return value).
  # Any error raised by the socket is propagated after the socket is closed.
  def self.check(addr:, ctx:, type:, debug: false)
    client = (type == 'client')
    hb = ctx.socket(client ? :REQ : :REP)

    begin
      hb.verbose = debug

      if client
        hb.connect(addr)
        hb.send("")
      else
        hb.bind(addr)
        hb.recv
      end
    ensure
      # Close even when connect/bind/send/recv raises, so a failed heartbeat
      # does not leak the socket (the server side calls this in a loop).
      hb.close
    end

    nil
  end
end
|
@@ -0,0 +1,171 @@
|
|
1
|
+
require "bundler/setup"
|
2
|
+
require "nokogiri"
|
3
|
+
require "pry"
|
4
|
+
|
5
|
+
module Rpareia
  # Validates a project description written in XML and exposes the result as
  # a Hash through #project:
  #
  #   {
  #     name:         task name ('linkage' or 'deduplication'),
  #     data_sources: [{id:, file:, fields: [{name:, type:}, ...]}, ...],
  #     parts:        [{field_name:}, ...],
  #     output:       value of the output element's 'deterministic' attribute
  #   }
  #
  # Parsing happens in the constructor; the first problem found is reported by
  # raising one of the error classes declared below.
  class Parser
    attr_reader :project

    class SyntaxError < StandardError; end
    class InvalidTaskError < StandardError; end
    class InvalidNumberOfSources < StandardError; end
    class DuplicatedDataSourceId < StandardError; end
    class MissingDataSourceId < StandardError; end
    class MissingDataSourceFile < StandardError; end
    class InvalidDataSourceFile < StandardError; end
    class InvalidDataSourceType < StandardError; end
    class MissingFieldSeparator < StandardError; end
    class FieldsElementNotFound < StandardError; end
    class MissingFieldName < StandardError; end
    class InvalidFieldType < StandardError; end
    class DuplicatedFieldName < StandardError; end
    class DeterministicLinkageElementNotFound < StandardError; end
    class ConjunctionElementNotFound < StandardError; end
    class MultipleConjunctionElements < StandardError; end
    class MissingPart < StandardError; end
    class MissingPartFieldNameOnDataSource < StandardError; end
    class MissingOutputElement < StandardError; end
    class MultipleOutputElement < StandardError; end
    class MissingDeterministicAttribute < StandardError; end

    # Field types accepted on a data source's field elements.
    VALID_FIELD_TYPES = ['int', 'string'].freeze

    # xml - String holding the project XML document.
    #
    # Raises one of the Parser error classes when the document is malformed
    # or fails validation.
    def initialize(xml)
      @xml = xml

      parse
    end

    private

    # Runs every validation step in order; later steps rely on the data that
    # earlier ones store in @project.
    def parse
      parse_xml
      parse_task
      parse_data_sources
      parse_deterministic_linkage
      parse_output
    end

    # Replaces the raw XML string with the parsed Nokogiri document.
    #
    # Raises SyntaxError carrying every parse error, one per line.
    def parse_xml
      @xml = Nokogiri::XML(@xml)

      raise SyntaxError, @xml.errors.join("\n") unless @xml.errors.empty?
    end

    # Starts the @project hash with the task named on the project element.
    #
    # Raises InvalidTaskError when the document has no /project/@task
    # attribute (previously this surfaced as a NoMethodError on nil).
    def parse_task
      task = @xml.xpath("/project/@task").first
      raise InvalidTaskError, "Missing 'task' attribute on project element" unless task

      @project = {name: task.value}
    end

    # Returns the first element of arr that occurs more than once, or nil.
    def find_duplicated(arr)
      arr.detect { |e| arr.rindex(e) != arr.index(e) }
    end

    # Validates every data-source element, stores the result under
    # @project[:data_sources] and checks that ids are unique and that the
    # number of sources matches the task.
    def parse_data_sources
      @project[:data_sources] = @xml.xpath("/project/data-sources/data-source").map do |data_source|
        parse_data_source(data_source)
      end

      if dup = find_duplicated(@project[:data_sources].map { |source| source[:id] })
        raise DuplicatedDataSourceId, "Duplicated data source id '#{dup}'"
      end

      check_number_of_sources
    end

    # Validates one data-source element and returns {id:, file:, fields:}.
    def parse_data_source(data_source)
      id = data_source['id'].to_s
      raise MissingDataSourceId if id.empty?

      file = data_source['file'].to_s
      raise MissingDataSourceFile, "Missing file attribute from data source '#{id}'" if file.empty?
      raise InvalidDataSourceFile, "File '#{file}' from data source '#{id}' does not exist" unless File.exist?(file)

      source_type = data_source['type'].to_s
      raise InvalidDataSourceType, "Data source type '#{source_type}' not supported" if source_type != "delimited"

      field_separator = data_source['field-separator'].to_s
      raise MissingFieldSeparator, "Missing field separator from data source '#{id}'" if field_separator.empty?

      fields = data_source.xpath("fields/field")
      raise FieldsElementNotFound, "Element 'field' not found on data source '#{id}'" if fields.empty?

      my_fields = fields.map { |field| parse_field(field, id) }

      if dup = find_duplicated(my_fields.map { |f| f[:name] })
        raise DuplicatedFieldName, "Duplicated field name '#{dup}' on data source '#{id}'"
      end

      {id: id, file: file, fields: my_fields}
    end

    # Validates one field element of data source `id` and returns
    # {name:, type:}.
    def parse_field(field, id)
      # to_s turns a missing attribute (nil) into "", so it is reported as
      # MissingFieldName instead of failing with NoMethodError on nil.
      name = field['name'].to_s
      raise MissingFieldName, "Attribute 'name' not found on data source '#{id}'" if name.empty?

      field_type = field['type']
      unless VALID_FIELD_TYPES.include?(field_type)
        raise InvalidFieldType, "Invalid type '#{field_type}' from field '#{name}', data source '#{id}'"
      end

      {name: name, type: field_type}
    end

    # Linkage needs exactly two data sources, deduplication exactly one; any
    # other task name is rejected.
    def check_number_of_sources
      count = @project[:data_sources].size

      case @project[:name]
      when 'linkage'
        raise InvalidNumberOfSources, "Linkage: expected two data-source, #{count} given" if count != 2
      when 'deduplication'
        raise InvalidNumberOfSources, "Deduplication: expected one data-sources, #{count} given" if count != 1
      else
        raise InvalidTaskError, "Invalid task: '#{@project[:name]}'"
      end
    end

    # Validates the deterministic-linkage section: exactly one conjunction
    # holding at least one part, every part naming a field that exists on
    # every data source. Stores the parts under @project[:parts].
    def parse_deterministic_linkage
      deterministic_linkage = @xml.xpath("/project/deterministic-linkage")
      if deterministic_linkage.empty?
        raise DeterministicLinkageElementNotFound, "Missing deterministic-linkage element"
      end

      conjunction = deterministic_linkage.xpath("conjunction")
      size = conjunction.size
      raise ConjunctionElementNotFound, "Missing conjunction element" if size.zero?
      raise MultipleConjunctionElements, "Only one conjunction element is allowed, #{size} found" if size > 1

      parts = conjunction.xpath("part")
      # NOTE: the message text (typo included) is asserted verbatim by
      # spec/rpareia/parser_spec.rb, so it is left untouched here.
      raise MissingPart, "At leas one part element is required" if parts.empty?

      @project[:parts] = parts.map do |part|
        field_name = part['field-name'].to_s
        raise MissingFieldName, "Missing attribute field-name on part element" if field_name.empty?

        {field_name: field_name}
      end

      check_parts_against_data_sources
    end

    # Raises MissingPartFieldNameOnDataSource for the first part whose field
    # name is absent from some data source.
    def check_parts_against_data_sources
      @project[:parts].each do |part|
        @project[:data_sources].each do |data_source|
          known_names = data_source[:fields].map { |f| f[:name] }
          next if known_names.include?(part[:field_name])

          raise MissingPartFieldNameOnDataSource,
                "Field name '#{part[:field_name]}' not found on data source '#{data_source[:id]}'"
        end
      end
    end

    # Requires exactly one output element carrying a 'deterministic'
    # attribute and stores that attribute's value under @project[:output].
    def parse_output
      output = @xml.xpath("/project/output")
      size = output.size
      raise MissingOutputElement, "Missing output element" if size.zero?
      raise MultipleOutputElement, "Only one output element is allowed, #{size} found" if size > 1

      @project[:output] = output.first['deterministic']
      unless @project[:output]
        raise MissingDeterministicAttribute, "Missing attribute 'deterministic' on output element"
      end
    end
  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'bundler/setup'
|
2
|
+
require 'rbczmq'
|
3
|
+
require_relative 'heartbeat'
|
4
|
+
require 'pry'
|
5
|
+
|
6
|
+
# Reads a comma-separated file and distributes its records among blockers
# over ZeroMQ DEALER sockets.
#
# Several readers can share one file: reader `rank` handles the lines whose
# zero-based index i satisfies i % num_readers == rank.
class Reader
  BLOCKER_PORT_START = 2000
  CTX = ZMQ::Context.new

  # num_readers - total number of readers sharing the file.
  # num_blocks  - number of blockers; blocker i is reached on port
  #               BLOCKER_PORT_START + i.
  # rank        - this reader's index in 0...num_readers.
  # file        - path of the comma-separated input file.
  # debug       - forwarded to the sockets' `verbose=` setting.
  # addr        - endpoint prefix the blocker ports are appended to.
  def initialize(num_readers:, num_blocks:, rank:, file:, debug: false, addr: "tcp://localhost")
    @num_readers = num_readers
    @num_blocks = num_blocks
    @rank = rank
    @file = file
    @debug = debug
    @addr = addr
  end

  # Connects to every blocker, then streams the file.
  def start
    start_connections
    read_database
  end

  # Runs a heartbeat check against each blocker (on twice its port number)
  # and then opens the data socket to it.
  def start_connections
    @connections = []

    @num_blocks.times do |i|
      port = BLOCKER_PORT_START + i
      Heartbeat.check(addr: "#{@addr}:#{2 * port}", ctx: CTX, type: 'client', debug: @debug)

      @connections << create_socket(port)
    end
  end

  # Returns a DEALER socket connected to the given blocker port.
  def create_socket(port)
    socket = CTX.socket(:DEALER)
    socket.verbose = @debug
    socket.connect("#{@addr}:#{port}")
    socket
  end

  # Sends this reader's share of the file to the blockers, then closes the
  # connections and prints "EOF".
  def read_database
    # File.foreach closes the file when iteration ends or raises; the former
    # File.open(...).each_line never closed the handle.
    File.foreach(@file).each_with_index do |line, index|
      next unless index % @num_readers == @rank

      record = line.strip
      # Blank lines still count towards the line index (so the split between
      # readers is unchanged) but hold no record to send.
      next if record.empty?

      send_pair(create_pair(record))
    end

    close_connections
    puts "EOF"
  end

  # Tells every blocker that this reader is done, closes the sockets and
  # destroys the shared context.
  def close_connections
    @connections.each do |c|
      c.send("EOF")
      c.close
    end
    CTX.destroy
  end

  # Routes the pair to the blocker chosen from its key (the last element) and
  # sends it as "id,key".
  def send_pair(pair)
    block_id = which_block(pair.last)

    @connections[block_id].send(pair.join(','))
  end

  # Builds the blocking key by concatenating the given field values.
  def create_hash(input)
    input.join
  end

  # Maps a key to a blocker index using the key's first byte. An empty key
  # (a record with nothing after the id) goes to blocker 0 instead of raising
  # NoMethodError on nil.
  def which_block(key)
    key.getbyte(0).to_i % @num_blocks
  end

  # Splits a record into [id, key]: the first comma-separated value and the
  # concatenation of the remaining ones.
  def create_pair(input)
    line = input.split(',')
    # drop(1) yields [] when the split is empty (e.g. input ","), where
    # line[1..-1] would be nil and break create_hash.
    [line.first, create_hash(line.drop(1))]
  end
end
|
83
|
+
|
84
|
+
# Script entry point. Command-line arguments: reader rank, input file
# (default "input/10m.csv") and blocker address (default "tcp://localhost").
reader_options = {
  debug: false,
  num_readers: 2,
  num_blocks: 1,
  rank: ARGV[0].to_i,
  file: ARGV[1] || "input/10m.csv",
  addr: ARGV[2] || "tcp://localhost"
}

Reader.new(**reader_options).start
|
data/rpareia.gemspec
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require File.join([File.dirname(__FILE__),'lib','rpareia','version.rb'])
|
2
|
+
|
3
|
+
Gem::Specification.new do |spec|
  # Identity and descriptive metadata.
  spec.name        = "rpareia"
  spec.description = "A data deduplication software, based on Pareia"
  spec.version     = Rpareia::VERSION
  spec.authors     = ["Michel Boaventura"]
  spec.email       = ["michel.boaventura@gmail.com"]
  spec.homepage    = "http://github.com/michelboaventura/rpareia"
  spec.platform    = Gem::Platform::RUBY
  spec.summary     = %q{Pareia implementation, with ruby and ZeroMQ}

  # Packaged files: everything tracked by git; specs double as test files.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.license       = "MIT"
  spec.require_paths = ["lib"]
  spec.bindir        = 'bin'
  spec.executables   = ['rpareia']
  spec.test_files    = spec.files.grep(%r{^(spec)/})

  # Development-only dependencies, declared in their original order.
  {
    "bundler"     => "~> 1.7",
    "rake"        => "~> 10.0",
    "pry"         => "~> 0.10",
    "rspec"       => "~> 3.2",
    "guard-rspec" => "~> 4.5"
  }.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end

  # Runtime dependencies, declared in their original order.
  {
    "nokogiri" => "~> 1.6",
    "rbczmq"   => "~> 1.7",
    "gli"      => "~> 2.12"
  }.each do |gem_name, requirement|
    spec.add_dependency gem_name, requirement
  end
end
|
File without changes
|
@@ -0,0 +1,215 @@
|
|
1
|
+
include Rpareia
|
2
|
+
|
3
|
+
DEFAULT_FILE = "spec/fixtures/empty_file.xml"
|
4
|
+
|
5
|
+
# Builds project XML for a deduplication task. The defaults describe a valid
# project with a single data source; override individual keywords to produce
# the variation under test.
def xml_dedup(task: 'deduplication', ids: [0], files: [DEFAULT_FILE], types: ['delimited'], field_separators: ["\t"], fields: [[{name: 'id', type: 'int'}]], parts: [{:'field-name' => 'id'}], deterministic: true, conjunction: 1, output_deterministic: ['foo'])
  options = {
    task: task,
    ids: ids,
    files: files,
    types: types,
    field_separators: field_separators,
    fields: fields,
    parts: parts,
    deterministic: deterministic,
    conjunction: conjunction,
    output_deterministic: output_deterministic
  }

  xml(**options)
end
|
8
|
+
|
9
|
+
# Builds project XML for a linkage task. The defaults describe a valid project
# with two data sources; override individual keywords to produce the
# variation under test.
def xml_linkage(task: 'linkage', ids: [0,1], files: [DEFAULT_FILE, DEFAULT_FILE], types: ['delimited', 'delimited'], field_separators: ["\t", "\t"], fields: [[{name: 'id', type: 'int'}], [{name: 'id', type: 'int'}]], parts: [{:'field-name' => 'id'}], deterministic: true, conjunction: 1, output_deterministic: ['foo'])
  options = {
    task: task,
    ids: ids,
    files: files,
    types: types,
    field_separators: field_separators,
    fields: fields,
    parts: parts,
    deterministic: deterministic,
    conjunction: conjunction,
    output_deterministic: output_deterministic
  }

  xml(**options)
end
|
13
|
+
|
14
|
+
# Renders a project XML document as a String.
#
# ids, files, types, field_separators and fields are parallel arrays indexed
# by data source. `conjunction` is how many conjunction elements to emit,
# each holding every entry of `parts`. `output_deterministic` yields one
# output element per entry, a bare output element when it is empty, and no
# output element at all when it is falsy.
def xml(task:, ids: [], files: nil, types: nil, field_separators: nil, fields: nil, parts: nil, deterministic: true, conjunction: 1, output_deterministic: ['foo'])
  builder = Nokogiri::XML::Builder.new do |doc|
    doc.project(task: task) do
      doc.send('data-sources') do
        ids.each_with_index do |id, index|
          attributes = {
            id: id,
            file: files[index],
            type: types[index],
            :'field-separator' => field_separators[index]
          }

          doc.send('data-source', attributes) do
            doc.fields do
              # A data source without a fields entry gets an empty element.
              (fields[index] || []).each { |field| doc.field(field) }
            end
          end
        end
      end

      if deterministic
        doc.send('deterministic-linkage') do
          conjunction.times do
            doc.conjunction do
              parts.each { |part| doc.part(part) }
            end
          end
        end
      end

      if output_deterministic
        if output_deterministic.empty?
          doc.output
        else
          output_deterministic.each { |out| doc.output(deterministic: out) }
        end
      end
    end
  end

  builder.to_xml
end
|
52
|
+
|
53
|
+
# Behaviour of Rpareia::Parser, driven through the xml_dedup / xml_linkage
# helpers: each example changes one aspect of an otherwise valid project and
# checks which error (and message) the constructor raises.
RSpec.describe Parser do
  context "initialization" do
    it "expects an XML string as argument" do
      expect { Parser.new }.to raise_error(ArgumentError)
    end

    it "complains about invalid XML" do
      bad_xml = '<?xml version="1.0 encoding="UTF-8"?>'
      expect { Parser.new(bad_xml) }.to raise_error(Parser::SyntaxError)
    end
  end

  context "tasks" do
    it "accepts only valid tasks" do
      expect { Parser.new(xml_dedup) }.not_to raise_error
      expect { Parser.new(xml_linkage) }.not_to raise_error
    end

    it "rejects invalid tasks" do
      expect { Parser.new(xml_dedup(task: 'foo')) }
        .to raise_error(Parser::InvalidTaskError, "Invalid task: 'foo'")
    end
  end

  context "data source" do
    it "accepts only one data source when deduplicating" do
      expect { Parser.new(xml_linkage(task: 'deduplication')) }
        .to raise_error(Parser::InvalidNumberOfSources, "Deduplication: expected one data-sources, 2 given")
    end

    it "accepts only two data sources when linking" do
      expect { Parser.new(xml_dedup(task: 'linkage')) }
        .to raise_error(Parser::InvalidNumberOfSources, "Linkage: expected two data-source, 1 given")
    end

    it "rejects data sources without id" do
      expect { Parser.new(xml_dedup(ids: [nil])) }.to raise_error(Parser::MissingDataSourceId)
    end

    it "rejects two data sources with same id" do
      expect { Parser.new(xml_linkage(ids: [1,1])) }
        .to raise_error(Parser::DuplicatedDataSourceId, "Duplicated data source id '1'")
    end

    it "expects a file name" do
      expect { Parser.new(xml_dedup(files: [''])) }
        .to raise_error(Parser::MissingDataSourceFile, "Missing file attribute from data source '0'")
    end

    it "rejects invalid file names" do
      expect { Parser.new(xml_dedup(files: ['invalid'])) }
        .to raise_error(Parser::InvalidDataSourceFile, "File 'invalid' from data source '0' does not exist")
    end

    it "accepts only the 'delimited' type" do
      expect { Parser.new(xml_dedup(types: ['delimited'])) }.not_to raise_error

      expect { Parser.new(xml_dedup(types: ['foo'])) }
        .to raise_error(Parser::InvalidDataSourceType, "Data source type 'foo' not supported")
    end

    it "expects a field-separator" do
      expect { Parser.new(xml_dedup(field_separators: ["\t"])) }.not_to raise_error

      expect { Parser.new(xml_dedup(field_separators: [""])) }
        .to raise_error(Parser::MissingFieldSeparator, "Missing field separator from data source '0'")
    end

    it "expects a 'fields' entry" do
      valid = xml_dedup(fields: [[{name: 'id', type: 'int'}]], parts: [{:'field-name' => 'id'}])
      expect { Parser.new(valid) }.not_to raise_error

      expect { Parser.new(xml_dedup(fields: [])) }.to raise_error(Parser::FieldsElementNotFound)
    end

    it "expects a name attribute on field elements" do
      expect { Parser.new(xml_dedup(fields: [[{name: '', type: 'int'}]])) }
        .to raise_error(Parser::MissingFieldName)
    end

    it "rejects duplicated names on the same data source" do
      duplicated = xml_dedup(fields: [[{name: 'foo', type: 'int'},{name: 'foo', type: 'int'}]])
      expect { Parser.new(duplicated) }
        .to raise_error(Parser::DuplicatedFieldName, "Duplicated field name 'foo' on data source '0'")
    end

    it "expects a valid field type" do
      ['int', 'string'].each do |type|
        valid = xml_dedup(fields: [[{name: 'foo', type: type}]], parts: [{:'field-name' => 'foo'}])
        expect { Parser.new(valid) }.not_to raise_error
      end

      expect { Parser.new(xml_dedup(fields: [[{name: 'foo', type: 'bar'}]])) }
        .to raise_error(Parser::InvalidFieldType, "Invalid type 'bar' from field 'foo', data source '0'")
    end
  end

  context "deterministic" do
    it "expects a deterministic-linkage element" do
      expect { Parser.new(xml_dedup(deterministic: false)) }
        .to raise_error(Parser::DeterministicLinkageElementNotFound, "Missing deterministic-linkage element")
    end

    it "expects a single conjunction element" do
      expect { Parser.new(xml_dedup(conjunction: 0)) }
        .to raise_error(Parser::ConjunctionElementNotFound, "Missing conjunction element")

      expect { Parser.new(xml_dedup(conjunction: 1)) }.not_to raise_error

      conjunction = 2
      expect { Parser.new(xml_dedup(conjunction: conjunction)) }
        .to raise_error(Parser::MultipleConjunctionElements, "Only one conjunction element is allowed, #{conjunction} found")
    end

    it "expects at least one part element" do
      valid = xml_dedup(fields: [[{name: 'foo', type: 'int'}]], parts: [{:'field-name' => 'foo'}])
      expect { Parser.new(valid) }.not_to raise_error

      # The expected text mirrors the parser's message verbatim.
      expect { Parser.new(xml_dedup(parts: [])) }
        .to raise_error(Parser::MissingPart, "At leas one part element is required")
    end

    it "expects a field-name on part elements" do
      valid = xml_dedup(fields: [[{name: 'foo', type: 'string'}]], parts: [{:'field-name' => 'foo'}])
      expect { Parser.new(valid) }.not_to raise_error

      expect { Parser.new(xml_dedup(parts: [{not_field_name: 'foo'}])) }
        .to raise_error(Parser::MissingFieldName, "Missing attribute field-name on part element")
    end

    it "accepts only field-names present on all data sources" do
      matching = xml_linkage(fields: [[{name: 'foo', type: 'string'}],[{name: 'foo', type: 'string'}]], parts: [{:'field-name' => 'foo'}])
      expect { Parser.new(matching) }.not_to raise_error

      mismatched = xml_linkage(fields: [[{name: 'foo', type: 'string'}],[{name: 'bar', type: 'string'}]], parts: [{:'field-name' => 'foo'}])
      expect { Parser.new(mismatched) }
        .to raise_error(Parser::MissingPartFieldNameOnDataSource, "Field name 'foo' not found on data source '1'")
    end
  end

  context "output" do
    it "expects a single 'output' element" do
      expect { Parser.new(xml_dedup(output_deterministic: ["foo"])) }.not_to raise_error

      expect { Parser.new(xml_dedup(output_deterministic: ["foo","bar"])) }
        .to raise_error(Parser::MultipleOutputElement, "Only one output element is allowed, 2 found")

      expect { Parser.new(xml_dedup(output_deterministic: false)) }
        .to raise_error(Parser::MissingOutputElement, "Missing output element")
    end

    it "expects a deterministic attribute on output element" do
      expect { Parser.new(xml_dedup(output_deterministic: "")) }
        .to raise_error(Parser::MissingDeterministicAttribute, "Missing attribute 'deterministic' on output element")
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'rpareia'
|
2
|
+
|
3
|
+
# Suite-wide RSpec settings.
RSpec.configure do |rspec|
  rspec.expect_with(:rspec) do |expectations|
    expectations.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  rspec.disable_monkey_patching!

  # rspec.warnings = true

  # Use the documentation formatter when exactly one spec file is run.
  rspec.default_formatter = 'doc' if rspec.files_to_run.one?

  # rspec.profile_examples = 10

  # Run examples in random order and seed Kernel's RNG with the same seed.
  rspec.order = :random

  Kernel.srand rspec.seed
end
|
metadata
ADDED
@@ -0,0 +1,179 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rpareia
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Michel Boaventura
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-02-18 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.7'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: pry
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0.10'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0.10'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '3.2'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '3.2'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: guard-rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '4.5'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '4.5'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: nokogiri
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '1.6'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '1.6'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rbczmq
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '1.7'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '1.7'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: gli
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - "~>"
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.12'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - "~>"
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '2.12'
|
125
|
+
description: A data deduplication software, based on Pareia
|
126
|
+
email:
|
127
|
+
- michel.boaventura@gmail.com
|
128
|
+
executables:
|
129
|
+
- rpareia
|
130
|
+
extensions: []
|
131
|
+
extra_rdoc_files: []
|
132
|
+
files:
|
133
|
+
- ".rspec"
|
134
|
+
- ".travis.yml"
|
135
|
+
- Gemfile
|
136
|
+
- Gemfile.lock
|
137
|
+
- Guardfile
|
138
|
+
- LICENSE.txt
|
139
|
+
- README.md
|
140
|
+
- Rakefile
|
141
|
+
- bin/rpareia
|
142
|
+
- lib/rpareia.rb
|
143
|
+
- lib/rpareia/blocker.rb
|
144
|
+
- lib/rpareia/heartbeat.rb
|
145
|
+
- lib/rpareia/parser.rb
|
146
|
+
- lib/rpareia/reader.rb
|
147
|
+
- lib/rpareia/version.rb
|
148
|
+
- rpareia.gemspec
|
149
|
+
- spec/fixtures/empty_file.xml
|
150
|
+
- spec/rpareia/parser_spec.rb
|
151
|
+
- spec/spec_helper.rb
|
152
|
+
homepage: http://github.com/michelboaventura/rpareia
|
153
|
+
licenses:
|
154
|
+
- MIT
|
155
|
+
metadata: {}
|
156
|
+
post_install_message:
|
157
|
+
rdoc_options: []
|
158
|
+
require_paths:
|
159
|
+
- lib
|
160
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
161
|
+
requirements:
|
162
|
+
- - ">="
|
163
|
+
- !ruby/object:Gem::Version
|
164
|
+
version: '0'
|
165
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
166
|
+
requirements:
|
167
|
+
- - ">="
|
168
|
+
- !ruby/object:Gem::Version
|
169
|
+
version: '0'
|
170
|
+
requirements: []
|
171
|
+
rubyforge_project:
|
172
|
+
rubygems_version: 2.4.5
|
173
|
+
signing_key:
|
174
|
+
specification_version: 4
|
175
|
+
summary: Pareia implementation, with ruby and ZeroMQ
|
176
|
+
test_files:
|
177
|
+
- spec/fixtures/empty_file.xml
|
178
|
+
- spec/rpareia/parser_spec.rb
|
179
|
+
- spec/spec_helper.rb
|