bmg 0.17.7 → 0.18.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -3
  3. data/README.md +236 -57
  4. data/lib/bmg.rb +6 -0
  5. data/lib/bmg/algebra.rb +1 -0
  6. data/lib/bmg/algebra/shortcuts.rb +14 -0
  7. data/lib/bmg/operator/allbut.rb +27 -0
  8. data/lib/bmg/operator/autosummarize.rb +27 -4
  9. data/lib/bmg/operator/autowrap.rb +27 -0
  10. data/lib/bmg/operator/constants.rb +7 -0
  11. data/lib/bmg/operator/extend.rb +7 -0
  12. data/lib/bmg/operator/group.rb +1 -0
  13. data/lib/bmg/operator/image.rb +41 -2
  14. data/lib/bmg/operator/join.rb +1 -0
  15. data/lib/bmg/operator/matching.rb +1 -0
  16. data/lib/bmg/operator/not_matching.rb +1 -0
  17. data/lib/bmg/operator/page.rb +2 -7
  18. data/lib/bmg/operator/project.rb +3 -2
  19. data/lib/bmg/operator/rename.rb +7 -0
  20. data/lib/bmg/operator/restrict.rb +1 -0
  21. data/lib/bmg/operator/rxmatch.rb +1 -0
  22. data/lib/bmg/operator/summarize.rb +2 -17
  23. data/lib/bmg/operator/transform.rb +39 -1
  24. data/lib/bmg/operator/union.rb +1 -0
  25. data/lib/bmg/reader.rb +1 -0
  26. data/lib/bmg/reader/csv.rb +29 -10
  27. data/lib/bmg/reader/excel.rb +22 -3
  28. data/lib/bmg/reader/text_file.rb +56 -0
  29. data/lib/bmg/relation.rb +31 -2
  30. data/lib/bmg/relation/empty.rb +4 -0
  31. data/lib/bmg/relation/in_memory.rb +10 -1
  32. data/lib/bmg/relation/materialized.rb +6 -0
  33. data/lib/bmg/relation/spied.rb +6 -1
  34. data/lib/bmg/sequel/relation.rb +5 -0
  35. data/lib/bmg/sql/relation.rb +2 -3
  36. data/lib/bmg/summarizer.rb +29 -1
  37. data/lib/bmg/summarizer/avg.rb +3 -3
  38. data/lib/bmg/summarizer/by_proc.rb +41 -0
  39. data/lib/bmg/summarizer/distinct.rb +36 -0
  40. data/lib/bmg/summarizer/multiple.rb +46 -0
  41. data/lib/bmg/summarizer/percentile.rb +47 -0
  42. data/lib/bmg/support.rb +2 -0
  43. data/lib/bmg/support/ordering.rb +20 -0
  44. data/lib/bmg/support/output_preferences.rb +44 -0
  45. data/lib/bmg/support/tuple_algebra.rb +6 -0
  46. data/lib/bmg/support/tuple_transformer.rb +4 -5
  47. data/lib/bmg/version.rb +2 -2
  48. data/lib/bmg/writer.rb +16 -0
  49. data/lib/bmg/writer/csv.rb +7 -7
  50. data/lib/bmg/writer/xlsx.rb +68 -0
  51. data/tasks/test.rake +9 -2
  52. metadata +37 -15
@@ -37,6 +37,7 @@ module Bmg
37
37
  end
38
38
 
39
39
  def each(&bl)
40
+ return to_enum unless block_given?
40
41
  if all?
41
42
  operands.each do |op|
42
43
  op.each(&bl)
data/lib/bmg/reader.rb CHANGED
@@ -9,3 +9,4 @@ module Bmg
9
9
  end
10
10
  require_relative "reader/csv"
11
11
  require_relative "reader/excel"
12
+ require_relative "reader/text_file"
@@ -5,30 +5,36 @@ module Bmg
5
5
 
6
6
  DEFAULT_OPTIONS = {
7
7
  :headers => true,
8
- :return_headers => false
8
+ :return_headers => false,
9
+ :smart => true
9
10
  }
10
11
 
11
- def initialize(type, path, options = {})
12
+ def initialize(type, path_or_io, options = {})
12
13
  @type = type
13
- @path = path
14
+ @path_or_io = path_or_io
14
15
  @options = DEFAULT_OPTIONS.merge(options)
15
- @options[:col_sep] ||= infer_col_sep
16
- @options[:quote_char] ||= infer_quote_char
16
+ if @options[:smart] && !@path_or_io.is_a?(IO)
17
+ @options[:col_sep] ||= infer_col_sep
18
+ @options[:quote_char] ||= infer_quote_char
19
+ end
17
20
  end
18
21
 
19
22
  def each
23
+ return to_enum unless block_given?
20
24
  require 'csv'
21
- ::CSV.foreach(@path, @options) do |row|
22
- yield tuple(row)
25
+ with_io do |io|
26
+ ::CSV.new(io, csv_options).each do |row|
27
+ yield tuple(row)
28
+ end
23
29
  end
24
30
  end
25
31
 
26
32
  def to_ast
27
- [ :csv, @path, @options ]
33
+ [ :csv, @path_or_io, @options ]
28
34
  end
29
35
 
30
36
  def to_s
31
- "(csv #{path})"
37
+ "(csv #{@path_or_io})"
32
38
  end
33
39
  alias :inspect :to_s
34
40
 
@@ -47,7 +53,16 @@ module Bmg
47
53
  end
48
54
 
49
55
  def text_portion
50
- @text_portion ||= File.foreach(@path).first(10).join("\n")
56
+ @text_portion ||= with_io{|io| io.readlines(10).join("\n") }
57
+ end
58
+
59
+ def with_io(&bl)
60
+ case @path_or_io
61
+ when IO, StringIO
62
+ bl.call(@path_or_io)
63
+ else
64
+ File.open(@path_or_io, "r", &bl)
65
+ end
51
66
  end
52
67
 
53
68
  # Finds the best candidate among `candidates` for a separator
@@ -61,6 +76,10 @@ module Bmg
61
76
  snif.size > 0 ? snif[0][0] : default
62
77
  end
63
78
 
79
+ def csv_options
80
+ @csv_options ||= @options.dup.tap{|opts| opts.delete(:smart) }
81
+ end
82
+
64
83
  end # class Csv
65
84
  end # module Reader
66
85
  end # module Bmg
@@ -4,7 +4,8 @@ module Bmg
4
4
  include Reader
5
5
 
6
6
  DEFAULT_OPTIONS = {
7
- skip: 0
7
+ skip: 0,
8
+ row_num: true
8
9
  }
9
10
 
10
11
  def initialize(type, path, options = {})
@@ -14,6 +15,7 @@ module Bmg
14
15
  end
15
16
 
16
17
  def each
18
+ return to_enum unless block_given?
17
19
  require 'roo'
18
20
  xlsx = Roo::Spreadsheet.open(@path, @options)
19
21
  headers = nil
@@ -25,7 +27,11 @@ module Bmg
25
27
  if i==0
26
28
  headers = row.map(&:to_sym)
27
29
  else
28
- tuple = (0...headers.size).each_with_object({}){|i,t| t[headers[i]] = row[i] }
30
+ init = init_tuple(i)
31
+ tuple = (0...headers.size)
32
+ .each_with_object(init){|i,t|
33
+ t[headers[i]] = row[i]
34
+ }
29
35
  yield(tuple)
30
36
  end
31
37
  end
@@ -36,10 +42,23 @@ module Bmg
36
42
  end
37
43
 
38
44
  def to_s
39
- "(excel #{path})"
45
+ "(excel #{@path})"
40
46
  end
41
47
  alias :inspect :to_s
42
48
 
49
+ private
50
+
51
+ def init_tuple(i)
52
+ case as = @options[:row_num]
53
+ when TrueClass
54
+ { :row_num => i }
55
+ when FalseClass
56
+ {}
57
+ when Symbol
58
+ { :"#{as}" => i }
59
+ end
60
+ end
61
+
43
62
  end # class Excel
44
63
  end # module Reader
45
64
  end # module Bmg
@@ -0,0 +1,56 @@
1
+ module Bmg
2
+ module Reader
3
+ class TextFile
4
+ include Reader
5
+
6
+ DEFAULT_OPTIONS = {
7
+ strip: true,
8
+ parse: nil
9
+ }
10
+
11
+ def initialize(type, path, options = {})
12
+ options = { parse: options } if options.is_a?(Regexp)
13
+ @path = path
14
+ @options = DEFAULT_OPTIONS.merge(options)
15
+ @type = infer_type(type)
16
+ end
17
+ attr_reader :path, :options
18
+
19
+ public # Relation
20
+
21
+ def each
22
+ path.each_line.each_with_index do |text, line|
23
+ text = text.strip if strip?
24
+ parsed = parse(text)
25
+ yield({line: 1+line}.merge(parsed)) if parsed
26
+ end
27
+ end
28
+
29
+ private
30
+
31
+ def infer_type(base)
32
+ return base unless base == Bmg::Type::ANY
33
+ attr_list = if rx = options[:parse]
34
+ [:line] + rx.names.map(&:to_sym)
35
+ else
36
+ [:line, :text]
37
+ end
38
+ base
39
+ .with_attrlist(attr_list)
40
+ .with_keys([[:line]])
41
+ end
42
+
43
+ def strip?
44
+ options[:strip]
45
+ end
46
+
47
+ def parse(text)
48
+ return { text: text } unless rx = options[:parse]
49
+ if match = rx.match(text)
50
+ TupleAlgebra.symbolize_keys(match.named_captures)
51
+ end
52
+ end
53
+
54
+ end # class TextFile
55
+ end # module Reader
56
+ end # module Bmg
data/lib/bmg/relation.rb CHANGED
@@ -17,6 +17,22 @@ module Bmg
17
17
  self
18
18
  end
19
19
 
20
+ def type
21
+ Bmg::Type::ANY
22
+ end
23
+
24
+ def with_type(type)
25
+ dup.tap{|r|
26
+ r.type = type
27
+ }
28
+ end
29
+
30
+ def with_type_attrlist
31
+ return self if type.knows_attrlist?
32
+ attrs = self.first.keys
33
+ with_type(type.with_attrlist(attrs))
34
+ end
35
+
20
36
  def with_typecheck
21
37
  dup.tap{|r|
22
38
  r.type = r.type.with_typecheck
@@ -100,6 +116,18 @@ module Bmg
100
116
  end
101
117
  end
102
118
 
119
+ def count
120
+ if type.knows_keys?
121
+ project(type.keys.first)._count
122
+ else
123
+ self._count
124
+ end
125
+ end
126
+
127
+ def _count
128
+ to_a.size
129
+ end
130
+
103
131
  # Returns a json representation
104
132
  def to_json(*args, &bl)
105
133
  to_a.to_json(*args, &bl)
@@ -113,9 +141,10 @@ module Bmg
113
141
  # When no string_or_io is used, the method uses a string.
114
142
  #
115
143
  # The method always returns the string_or_io.
116
- def to_csv(options = {}, string_or_io = nil)
144
+ def to_csv(options = {}, string_or_io = nil, preferences = nil)
117
145
  options, string_or_io = {}, options unless options.is_a?(Hash)
118
- Writer::Csv.new(options).call(self, string_or_io)
146
+ string_or_io, preferences = nil, string_or_io if string_or_io.is_a?(Hash)
147
+ Writer::Csv.new(options, preferences).call(self, string_or_io)
119
148
  end
120
149
 
121
150
  # Converts to an sexpr expression.
@@ -19,6 +19,10 @@ module Bmg
19
19
  def each(&bl)
20
20
  end
21
21
 
22
+ def _count
23
+ 0
24
+ end
25
+
22
26
  def to_ast
23
27
  [ :empty ]
24
28
  end
@@ -8,7 +8,6 @@ module Bmg
8
8
  @type = type
9
9
  end
10
10
  attr_accessor :type
11
- protected :type=
12
11
  attr_reader :operand
13
12
 
14
13
  public
@@ -17,6 +16,16 @@ module Bmg
17
16
  @operand.each(&bl)
18
17
  end
19
18
 
19
+ def _count
20
+ if operand.respond_to?(:count)
21
+ operand.count
22
+ elsif operand.respond_to?(:size)
23
+ operand.size
24
+ else
25
+ super
26
+ end
27
+ end
28
+
20
29
  def to_ast
21
30
  [ :in_memory, operand ]
22
31
  end
@@ -16,6 +16,12 @@ module Bmg
16
16
  end
17
17
  protected :type=
18
18
 
19
+ public
20
+
21
+ def _count
22
+ operand._count
23
+ end
24
+
19
25
  public
20
26
 
21
27
  def each(&bl)
@@ -24,10 +24,15 @@ module Bmg
24
24
  protected :type=
25
25
 
26
26
  def each(&bl)
27
- spy.call(self)
27
+ spy.call(self) if bl
28
28
  operand.each(&bl)
29
29
  end
30
30
 
31
+ def count
32
+ spy.call(self) if bl
33
+ operand.count
34
+ end
35
+
31
36
  def to_ast
32
37
  [ :spied, operand.to_ast, spy ]
33
38
  end
@@ -9,6 +9,7 @@ module Bmg
9
9
  attr_reader :sequel_db
10
10
 
11
11
  def each(&bl)
12
+ return to_enum unless block_given?
12
13
  dataset.each(&bl)
13
14
  end
14
15
 
@@ -33,6 +34,10 @@ module Bmg
33
34
  base_table.update(arg)
34
35
  end
35
36
 
37
+ def _count
38
+ dataset.count
39
+ end
40
+
36
41
  def to_ast
37
42
  [:sequel, dataset.sql]
38
43
  end
@@ -10,7 +10,6 @@ module Bmg
10
10
  end
11
11
 
12
12
  attr_accessor :type
13
- protected :type=
14
13
 
15
14
  protected
16
15
 
@@ -134,8 +133,8 @@ module Bmg
134
133
  _instance(type, builder, expr)
135
134
  end
136
135
 
137
- def _summarize(type, by, summarization)
138
- summarization = Operator::Summarize.compile(summarization)
136
+ def _summarize(type, by, defs)
137
+ summarization = ::Bmg::Summarizer.summarization(defs)
139
138
  if can_compile_summarization?(summarization)
140
139
  expr = before_use(self.expr)
141
140
  expr = Processor::Summarize.new(by, summarization, builder).call(self.expr)
@@ -50,6 +50,21 @@ module Bmg
50
50
  end
51
51
  end
52
52
 
53
+ # Converts some summarization definitions to a Hash of
54
+ # summarizers.
55
+ def self.summarization(defs)
56
+ Hash[defs.map{|k,v|
57
+ summarizer = case v
58
+ when Summarizer then v
59
+ when Symbol then Summarizer.send(v, k)
60
+ when Proc then Summarizer.by_proc(&v)
61
+ else
62
+ raise ArgumentError, "Unexpected summarizer #{k} => #{v}"
63
+ end
64
+ [ k, summarizer ]
65
+ }]
66
+ end
67
+
53
68
  # Returns the default options to use
54
69
  #
55
70
  # @return the default aggregation options
@@ -80,7 +95,16 @@ module Bmg
80
95
  # @param the current iterated tuple
81
96
  # @return updated memo value
82
97
  def happens(memo, tuple)
83
- value = @functor.is_a?(Proc) ? @functor.call(tuple) : tuple[@functor]
98
+ value = case @functor
99
+ when Proc
100
+ @functor.call(tuple)
101
+ when NilClass
102
+ tuple
103
+ when Symbol
104
+ tuple[@functor]
105
+ else
106
+ tuple[@functor]
107
+ end
84
108
  _happens(memo, value)
85
109
  end
86
110
 
@@ -128,5 +152,9 @@ require_relative 'summarizer/max'
128
152
  require_relative 'summarizer/avg'
129
153
  require_relative 'summarizer/variance'
130
154
  require_relative 'summarizer/stddev'
155
+ require_relative 'summarizer/percentile'
131
156
  require_relative 'summarizer/collect'
157
+ require_relative 'summarizer/distinct'
132
158
  require_relative 'summarizer/concat'
159
+ require_relative 'summarizer/by_proc'
160
+ require_relative 'summarizer/multiple'
@@ -16,13 +16,13 @@ module Bmg
16
16
  end
17
17
 
18
18
  # Collects one more value + the sum of all
19
- def _happens(memo, val)
19
+ def _happens(memo, val)
20
20
  [memo.first + val, memo.last + 1]
21
21
  end
22
22
 
23
23
  # Finalizes the computation.
24
- def finalize(memo)
25
- memo.first / memo.last
24
+ def finalize(memo)
25
+ memo.first / memo.last
26
26
  end
27
27
 
28
28
  end # class Avg
@@ -0,0 +1,41 @@
1
+ module Bmg
2
+ class Summarizer
3
+ #
4
+ # Generic summarizer that takes a Proc àla each_with_object.
5
+ #
6
+ # Example:
7
+ #
8
+ # # direct ruby usage
9
+ # Bmg::Summarizer.by_proc{|t,memo| ... }.summarize(...)
10
+ #
11
+ class ByProc < Summarizer
12
+
13
+ def initialize(least, by_proc)
14
+ @least = least
15
+ @by_proc = by_proc
16
+ end
17
+
18
+ # Returns [] as least value.
19
+ def least()
20
+ @least
21
+ end
22
+
23
+ # Adds val to the memo array
24
+ def happens(memo, val)
25
+ @by_proc.call(val, memo)
26
+ end
27
+
28
+ def finalize(memo)
29
+ memo
30
+ end
31
+
32
+ end # class ByProc
33
+
34
+ # Factors a distinct summarizer
35
+ def self.by_proc(least = nil, proc = nil, &bl)
36
+ least, proc = nil, least if least.is_a?(Proc)
37
+ ByProc.new(least, proc || bl)
38
+ end
39
+
40
+ end # class Summarizer
41
+ end # module Bmg