bmg 0.17.8 → 0.18.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +0 -3
- data/README.md +236 -57
- data/lib/bmg.rb +6 -0
- data/lib/bmg/algebra.rb +1 -0
- data/lib/bmg/algebra/shortcuts.rb +14 -0
- data/lib/bmg/operator/allbut.rb +27 -0
- data/lib/bmg/operator/autosummarize.rb +27 -4
- data/lib/bmg/operator/autowrap.rb +27 -0
- data/lib/bmg/operator/constants.rb +7 -0
- data/lib/bmg/operator/extend.rb +7 -0
- data/lib/bmg/operator/group.rb +1 -0
- data/lib/bmg/operator/image.rb +41 -2
- data/lib/bmg/operator/join.rb +1 -0
- data/lib/bmg/operator/matching.rb +1 -0
- data/lib/bmg/operator/not_matching.rb +1 -0
- data/lib/bmg/operator/page.rb +2 -7
- data/lib/bmg/operator/project.rb +3 -2
- data/lib/bmg/operator/rename.rb +12 -5
- data/lib/bmg/operator/restrict.rb +1 -0
- data/lib/bmg/operator/rxmatch.rb +1 -0
- data/lib/bmg/operator/summarize.rb +2 -17
- data/lib/bmg/operator/transform.rb +39 -1
- data/lib/bmg/operator/union.rb +1 -0
- data/lib/bmg/reader.rb +1 -0
- data/lib/bmg/reader/csv.rb +29 -10
- data/lib/bmg/reader/excel.rb +23 -4
- data/lib/bmg/reader/text_file.rb +56 -0
- data/lib/bmg/relation.rb +28 -0
- data/lib/bmg/relation/empty.rb +4 -0
- data/lib/bmg/relation/in_memory.rb +10 -1
- data/lib/bmg/relation/materialized.rb +6 -0
- data/lib/bmg/relation/spied.rb +6 -1
- data/lib/bmg/sequel/relation.rb +5 -0
- data/lib/bmg/sql/relation.rb +2 -3
- data/lib/bmg/summarizer.rb +29 -1
- data/lib/bmg/summarizer/avg.rb +3 -3
- data/lib/bmg/summarizer/by_proc.rb +41 -0
- data/lib/bmg/summarizer/distinct.rb +36 -0
- data/lib/bmg/summarizer/multiple.rb +46 -0
- data/lib/bmg/summarizer/percentile.rb +79 -0
- data/lib/bmg/support.rb +1 -0
- data/lib/bmg/support/ordering.rb +20 -0
- data/lib/bmg/support/tuple_algebra.rb +6 -0
- data/lib/bmg/support/tuple_transformer.rb +14 -6
- data/lib/bmg/version.rb +2 -2
- data/lib/bmg/writer.rb +16 -0
- data/lib/bmg/writer/csv.rb +0 -11
- data/lib/bmg/writer/xlsx.rb +68 -0
- data/tasks/test.rake +9 -2
- metadata +36 -15
data/lib/bmg/operator/union.rb
CHANGED
data/lib/bmg/reader.rb
CHANGED
data/lib/bmg/reader/csv.rb
CHANGED
@@ -5,30 +5,36 @@ module Bmg
|
|
5
5
|
|
6
6
|
DEFAULT_OPTIONS = {
|
7
7
|
:headers => true,
|
8
|
-
:return_headers => false
|
8
|
+
:return_headers => false,
|
9
|
+
:smart => true
|
9
10
|
}
|
10
11
|
|
11
|
-
def initialize(type,
|
12
|
+
def initialize(type, path_or_io, options = {})
|
12
13
|
@type = type
|
13
|
-
@
|
14
|
+
@path_or_io = path_or_io
|
14
15
|
@options = DEFAULT_OPTIONS.merge(options)
|
15
|
-
@options[:
|
16
|
-
|
16
|
+
if @options[:smart] && !@path_or_io.is_a?(IO)
|
17
|
+
@options[:col_sep] ||= infer_col_sep
|
18
|
+
@options[:quote_char] ||= infer_quote_char
|
19
|
+
end
|
17
20
|
end
|
18
21
|
|
19
22
|
# Yields one tuple per CSV row read from the configured path or IO.
#
# Without a block, returns an Enumerator over the same tuples.
# The `csv` library is loaded lazily, on first iteration.
#
# @yield [tuple] the result of `tuple(row)` for every row parsed by CSV
# @return [Enumerator] when no block is given
def each
  return to_enum unless block_given?

  require 'csv'
  with_io do |io|
    # CSV.new takes its options as keywords: splat the Hash explicitly so
    # the call is valid on Ruby 3 as well as on Ruby 2.
    ::CSV.new(io, **csv_options).each do |row|
      yield tuple(row)
    end
  end
end
|
25
31
|
|
26
32
|
def to_ast
|
27
|
-
[ :csv, @
|
33
|
+
[ :csv, @path_or_io, @options ]
|
28
34
|
end
|
29
35
|
|
30
36
|
def to_s
|
31
|
-
"(csv #{
|
37
|
+
"(csv #{@path_or_io})"
|
32
38
|
end
|
33
39
|
alias :inspect :to_s
|
34
40
|
|
@@ -47,7 +53,16 @@ module Bmg
|
|
47
53
|
end
|
48
54
|
|
49
55
|
# Returns (and memoizes) a short sample of the input: its first 10 lines.
#
# When the reader was built on a caller-supplied stream (as opposed to a
# path that `with_io` opens itself), the stream position is restored after
# sampling, so that the sample does not eat the rows a later read expects.
#
# @return [String] the first 10 lines of the input, line endings included
def text_portion
  @text_portion ||= with_io do |io|
    # Only a stream we did not open ourselves needs its position put back.
    start = io.pos if io.equal?(@path_or_io) && io.respond_to?(:pos)
    # `readlines(10)` would limit each line to 10 *bytes* and read the whole
    # input; take the first 10 lines instead.
    sample = io.each_line.first(10).join
    io.pos = start if start
    sample
  end
end
|
58
|
+
|
59
|
+
# Calls the block with a readable stream for the configured source.
#
# An IO or StringIO is handed to the block as is; anything else is treated
# as a path and opened for reading through File.open for the duration of
# the block.
#
# @return whatever the block returns
def with_io(&bl)
  source = @path_or_io
  return bl.call(source) if source.is_a?(IO) || source.is_a?(StringIO)

  File.open(source, "r", &bl)
end
|
52
67
|
|
53
68
|
# Finds the best candidate among `candidates` for a separator
|
@@ -61,6 +76,10 @@ module Bmg
|
|
61
76
|
snif.size > 0 ? snif[0][0] : default
|
62
77
|
end
|
63
78
|
|
79
|
+
# Returns (and memoizes) a copy of the reader options without the :smart
# entry. The result is what `each` hands over to CSV.
#
# @return [Hash] the options minus :smart; @options itself is not modified
def csv_options
  return @csv_options if @csv_options

  forwarded = @options.dup
  forwarded.delete(:smart)
  @csv_options = forwarded
end
|
82
|
+
|
64
83
|
end # class Csv
|
65
84
|
end # module Reader
|
66
85
|
end # module Bmg
|
data/lib/bmg/reader/excel.rb
CHANGED
@@ -4,7 +4,8 @@ module Bmg
|
|
4
4
|
include Reader
|
5
5
|
|
6
6
|
DEFAULT_OPTIONS = {
|
7
|
-
skip: 0
|
7
|
+
skip: 0,
|
8
|
+
row_num: true
|
8
9
|
}
|
9
10
|
|
10
11
|
def initialize(type, path, options = {})
|
@@ -14,6 +15,7 @@ module Bmg
|
|
14
15
|
end
|
15
16
|
|
16
17
|
def each
|
18
|
+
return to_enum unless block_given?
|
17
19
|
require 'roo'
|
18
20
|
xlsx = Roo::Spreadsheet.open(@path, @options)
|
19
21
|
headers = nil
|
@@ -23,9 +25,13 @@ module Bmg
|
|
23
25
|
.each_with_index
|
24
26
|
.each do |row, i|
|
25
27
|
if i==0
|
26
|
-
headers = row.map
|
28
|
+
headers = row.map{|c| c.to_s.strip.to_sym }
|
27
29
|
else
|
28
|
-
|
30
|
+
init = init_tuple(i)
|
31
|
+
tuple = (0...headers.size)
|
32
|
+
.each_with_object(init){|i,t|
|
33
|
+
t[headers[i]] = row[i]
|
34
|
+
}
|
29
35
|
yield(tuple)
|
30
36
|
end
|
31
37
|
end
|
@@ -36,10 +42,23 @@ module Bmg
|
|
36
42
|
end
|
37
43
|
|
38
44
|
def to_s
|
39
|
-
"(excel #{path})"
|
45
|
+
"(excel #{@path})"
|
40
46
|
end
|
41
47
|
alias :inspect :to_s
|
42
48
|
|
49
|
+
private
|
50
|
+
|
51
|
+
# Builds the seed Hash of a data tuple, according to the :row_num option.
#
# - true           => the index is stored under :row_num
# - false or nil   => no index attribute at all
# - Symbol, String => the index is stored under that attribute name
#
# @param i [Integer] the row index, as passed by the caller
# @return [Hash] a fresh Hash on every call
# @raise [ArgumentError] when :row_num has any other kind of value
def init_tuple(i)
  case as = @options[:row_num]
  when true
    { :row_num => i }
  when false, nil
    {}
  when Symbol, String
    { as.to_sym => i }
  else
    # Falling through would return nil and break the tuple construction
    # later with an unrelated error; fail here with an explicit message.
    raise ArgumentError, "Unexpected :row_num option `#{as.inspect}`"
  end
end
|
61
|
+
|
43
62
|
end # class Excel
|
44
63
|
end # module Reader
|
45
64
|
end # module Bmg
|
@@ -0,0 +1,56 @@
|
|
1
|
+
module Bmg
  module Reader
    # Reads a text source line by line and exposes each line as a tuple.
    #
    # Every tuple carries a 1-based :line attribute. Without a :parse
    # regexp the (optionally stripped) line is available under :text;
    # with one, the named captures of the regexp become the attributes
    # and lines that do not match are skipped.
    class TextFile
      include Reader

      DEFAULT_OPTIONS = {
        strip: true,
        parse: nil
      }

      # @param type the relation type; only refined by `infer_type` when
      #   it is Bmg::Type::ANY
      # @param path the text source; it must respond to `each_line`
      # @param options [Hash, Regexp] reader options (:strip, :parse);
      #   a Regexp is a shortcut for `{ parse: regexp }`
      def initialize(type, path, options = {})
        options = { parse: options } if options.is_a?(Regexp)
        @path = path
        @options = DEFAULT_OPTIONS.merge(options)
        @type = infer_type(type)
      end
      attr_reader :path, :options

    public # Relation

      # Yields one tuple per line kept by `parse`.
      #
      # Without a block, returns an Enumerator, like the other readers do.
      def each
        return to_enum unless block_given?

        path.each_line.each_with_index do |text, line|
          text = text.strip if strip?
          parsed = parse(text)
          # `line` is 0-based here; a nil `parsed` means "no match, skip".
          yield({line: 1+line}.merge(parsed)) if parsed
        end
      end

    private

      # Refines Bmg::Type::ANY with the attribute list known from the
      # options and with [:line] as key. Any other type is returned as is.
      def infer_type(base)
        return base unless base == Bmg::Type::ANY
        attr_list = if rx = options[:parse]
          [:line] + rx.names.map(&:to_sym)
        else
          [:line, :text]
        end
        base
          .with_attrlist(attr_list)
          .with_keys([[:line]])
      end

      # Whether lines must be stripped before being parsed.
      def strip?
        options[:strip]
      end

      # Converts a line to a Hash of attributes.
      #
      # Returns `{ text: text }` when no :parse regexp is configured, the
      # symbolized named captures when the regexp matches, nil otherwise.
      def parse(text)
        return { text: text } unless rx = options[:parse]
        if match = rx.match(text)
          TupleAlgebra.symbolize_keys(match.named_captures)
        end
      end

    end # class TextFile
  end # module Reader
end # module Bmg
|
data/lib/bmg/relation.rb
CHANGED
@@ -17,6 +17,22 @@ module Bmg
|
|
17
17
|
self
|
18
18
|
end
|
19
19
|
|
20
|
+
def type
|
21
|
+
Bmg::Type::ANY
|
22
|
+
end
|
23
|
+
|
24
|
+
# Returns a copy of this relation whose type is `type`. The receiver is
# left untouched: the type is assigned on the duplicate only.
def with_type(type)
  copy = dup
  copy.type = type
  copy
end
|
29
|
+
|
30
|
+
# Returns a relation whose type knows its attribute list.
#
# When the type already knows it, the receiver is returned. Otherwise the
# attribute list is taken from the keys of the first tuple. An empty
# relation has no tuple to look at: the receiver is returned unchanged
# instead of failing on `nil.keys`.
def with_type_attrlist
  return self if type.knows_attrlist?

  sample = first
  return self if sample.nil?

  with_type(type.with_attrlist(sample.keys))
end
|
35
|
+
|
20
36
|
def with_typecheck
|
21
37
|
dup.tap{|r|
|
22
38
|
r.type = r.type.with_typecheck
|
@@ -100,6 +116,18 @@ module Bmg
|
|
100
116
|
end
|
101
117
|
end
|
102
118
|
|
119
|
+
# Returns the number of tuples, as computed by `_count`.
#
# When the type knows its keys, the relation is first projected on the
# first of them and the projection is counted instead.
def count
  return _count unless type.knows_keys?

  project(type.keys.first)._count
end
|
126
|
+
|
127
|
+
def _count
|
128
|
+
to_a.size
|
129
|
+
end
|
130
|
+
|
103
131
|
# Returns a json representation
|
104
132
|
def to_json(*args, &bl)
|
105
133
|
to_a.to_json(*args, &bl)
|
data/lib/bmg/relation/empty.rb
CHANGED
@@ -8,7 +8,6 @@ module Bmg
|
|
8
8
|
@type = type
|
9
9
|
end
|
10
10
|
attr_accessor :type
|
11
|
-
protected :type=
|
12
11
|
attr_reader :operand
|
13
12
|
|
14
13
|
public
|
@@ -17,6 +16,16 @@ module Bmg
|
|
17
16
|
@operand.each(&bl)
|
18
17
|
end
|
19
18
|
|
19
|
+
# Counts the tuples by delegating to the operand when it can answer:
# `count` is preferred, then `size`. Otherwise the inherited
# implementation is used.
def _count
  return operand.count if operand.respond_to?(:count)
  return operand.size if operand.respond_to?(:size)

  super
end
|
28
|
+
|
20
29
|
def to_ast
|
21
30
|
[ :in_memory, operand ]
|
22
31
|
end
|
data/lib/bmg/relation/spied.rb
CHANGED
@@ -24,10 +24,15 @@ module Bmg
|
|
24
24
|
protected :type=
|
25
25
|
|
26
26
|
def each(&bl)
|
27
|
-
spy.call(self)
|
27
|
+
spy.call(self) if bl
|
28
28
|
operand.each(&bl)
|
29
29
|
end
|
30
30
|
|
31
|
+
# Notifies the spy with this relation, then returns the operand's count.
#
# Unlike `each`, this method receives no block, so there is no `bl` to
# test here: the spy is called unconditionally.
def count
  spy.call(self)
  operand.count
end
|
35
|
+
|
31
36
|
def to_ast
|
32
37
|
[ :spied, operand.to_ast, spy ]
|
33
38
|
end
|
data/lib/bmg/sequel/relation.rb
CHANGED
@@ -9,6 +9,7 @@ module Bmg
|
|
9
9
|
attr_reader :sequel_db
|
10
10
|
|
11
11
|
def each(&bl)
|
12
|
+
return to_enum unless block_given?
|
12
13
|
dataset.each(&bl)
|
13
14
|
end
|
14
15
|
|
@@ -33,6 +34,10 @@ module Bmg
|
|
33
34
|
base_table.update(arg)
|
34
35
|
end
|
35
36
|
|
37
|
+
def _count
|
38
|
+
dataset.count
|
39
|
+
end
|
40
|
+
|
36
41
|
def to_ast
|
37
42
|
[:sequel, dataset.sql]
|
38
43
|
end
|
data/lib/bmg/sql/relation.rb
CHANGED
@@ -10,7 +10,6 @@ module Bmg
|
|
10
10
|
end
|
11
11
|
|
12
12
|
attr_accessor :type
|
13
|
-
protected :type=
|
14
13
|
|
15
14
|
protected
|
16
15
|
|
@@ -134,8 +133,8 @@ module Bmg
|
|
134
133
|
_instance(type, builder, expr)
|
135
134
|
end
|
136
135
|
|
137
|
-
def _summarize(type, by,
|
138
|
-
summarization =
|
136
|
+
def _summarize(type, by, defs)
|
137
|
+
summarization = ::Bmg::Summarizer.summarization(defs)
|
139
138
|
if can_compile_summarization?(summarization)
|
140
139
|
expr = before_use(self.expr)
|
141
140
|
expr = Processor::Summarize.new(by, summarization, builder).call(self.expr)
|
data/lib/bmg/summarizer.rb
CHANGED
@@ -50,6 +50,21 @@ module Bmg
|
|
50
50
|
end
|
51
51
|
end
|
52
52
|
|
53
|
+
# Converts some summarization definitions to a Hash of summarizers.
#
# Each definition maps an attribute name to either:
# - a Summarizer instance, kept as is,
# - a Symbol naming a Summarizer factory, called with the attribute name,
# - a Proc, wrapped through `Summarizer.by_proc`.
#
# @param defs [Hash, Array] the name => definition pairs
# @return [Hash] the same names mapped to summarizer instances
# @raise [ArgumentError] on any other kind of definition
def self.summarization(defs)
  defs.map { |k, v|
    summarizer = case v
    when Summarizer then v
    # public_send, not send: a definition symbol must only reach public
    # factories, never private or Kernel methods.
    when Symbol then Summarizer.public_send(v, k)
    when Proc then Summarizer.by_proc(&v)
    else
      raise ArgumentError, "Unexpected summarizer #{k} => #{v}"
    end
    [ k, summarizer ]
  }.to_h
end
|
67
|
+
|
53
68
|
# Returns the default options to use
|
54
69
|
#
|
55
70
|
# @return the default aggregation options
|
@@ -80,7 +95,16 @@ module Bmg
|
|
80
95
|
# @param the current iterated tuple
|
81
96
|
# @return updated memo value
|
82
97
|
# Extracts the value to aggregate from `tuple`, then hands it over to
# `_happens` together with the current memo.
#
# The value is the result of the functor when it is a Proc, the tuple
# itself when there is no functor, and `tuple[@functor]` in every other
# case.
def happens(memo, tuple)
  value =
    if @functor.is_a?(Proc)
      @functor.call(tuple)
    elsif @functor.nil?
      tuple
    else
      tuple[@functor]
    end
  _happens(memo, value)
end
|
86
110
|
|
@@ -128,5 +152,9 @@ require_relative 'summarizer/max'
|
|
128
152
|
require_relative 'summarizer/avg'
|
129
153
|
require_relative 'summarizer/variance'
|
130
154
|
require_relative 'summarizer/stddev'
|
155
|
+
require_relative 'summarizer/percentile'
|
131
156
|
require_relative 'summarizer/collect'
|
157
|
+
require_relative 'summarizer/distinct'
|
132
158
|
require_relative 'summarizer/concat'
|
159
|
+
require_relative 'summarizer/by_proc'
|
160
|
+
require_relative 'summarizer/multiple'
|
data/lib/bmg/summarizer/avg.rb
CHANGED
@@ -16,13 +16,13 @@ module Bmg
|
|
16
16
|
end
|
17
17
|
|
18
18
|
# Collects one more value + the sum of all
|
19
|
-
def _happens(memo, val)
|
19
|
+
def _happens(memo, val)
|
20
20
|
[memo.first + val, memo.last + 1]
|
21
21
|
end
|
22
22
|
|
23
23
|
# Finalizes the computation.
|
24
|
-
def finalize(memo)
|
25
|
-
memo.first / memo.last
|
24
|
+
def finalize(memo)
|
25
|
+
memo.first / memo.last
|
26
26
|
end
|
27
27
|
|
28
28
|
end # class Avg
|
@@ -0,0 +1,41 @@
|
|
1
|
+
module Bmg
  class Summarizer
    #
    # Generic summarizer that takes a Proc à la each_with_object.
    #
    # Example:
    #
    #     # direct ruby usage
    #     Bmg::Summarizer.by_proc{|t,memo| ... }.summarize(...)
    #
    class ByProc < Summarizer

      # @param least the initial memo value, returned by `least`
      # @param by_proc a callable invoked as `call(tuple, memo)`
      # @raise [ArgumentError] when `by_proc` cannot be called
      def initialize(least, by_proc)
        # Fail at construction rather than with a `nil.call` error on the
        # first tuple.
        unless by_proc.respond_to?(:call)
          raise ArgumentError, "A proc (or block) is required"
        end
        @least = least
        @by_proc = by_proc
      end

      # Returns the initial memo value given at construction.
      def least()
        @least
      end

      # Calls the proc with the current value and the memo, in that order,
      # and returns its result.
      def happens(memo, val)
        @by_proc.call(val, memo)
      end

      # Returns the memo unchanged.
      def finalize(memo)
        memo
      end

    end # class ByProc

    # Factors a by_proc summarizer.
    #
    # Accepted forms: `by_proc(least, proc)`, `by_proc(proc)`,
    # `by_proc(least){ ... }` and `by_proc{ ... }`. A Proc passed as first
    # argument is always taken as the summarizing proc, with a nil least.
    def self.by_proc(least = nil, proc = nil, &bl)
      least, proc = nil, least if least.is_a?(Proc)
      ByProc.new(least, proc || bl)
    end

  end # class Summarizer
end # module Bmg
|