bmg 0.17.7 → 0.18.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +0 -3
  3. data/README.md +236 -57
  4. data/lib/bmg.rb +6 -0
  5. data/lib/bmg/algebra.rb +1 -0
  6. data/lib/bmg/algebra/shortcuts.rb +14 -0
  7. data/lib/bmg/operator/allbut.rb +27 -0
  8. data/lib/bmg/operator/autosummarize.rb +27 -4
  9. data/lib/bmg/operator/autowrap.rb +27 -0
  10. data/lib/bmg/operator/constants.rb +7 -0
  11. data/lib/bmg/operator/extend.rb +7 -0
  12. data/lib/bmg/operator/group.rb +1 -0
  13. data/lib/bmg/operator/image.rb +41 -2
  14. data/lib/bmg/operator/join.rb +1 -0
  15. data/lib/bmg/operator/matching.rb +1 -0
  16. data/lib/bmg/operator/not_matching.rb +1 -0
  17. data/lib/bmg/operator/page.rb +2 -7
  18. data/lib/bmg/operator/project.rb +3 -2
  19. data/lib/bmg/operator/rename.rb +7 -0
  20. data/lib/bmg/operator/restrict.rb +1 -0
  21. data/lib/bmg/operator/rxmatch.rb +1 -0
  22. data/lib/bmg/operator/summarize.rb +2 -17
  23. data/lib/bmg/operator/transform.rb +39 -1
  24. data/lib/bmg/operator/union.rb +1 -0
  25. data/lib/bmg/reader.rb +1 -0
  26. data/lib/bmg/reader/csv.rb +29 -10
  27. data/lib/bmg/reader/excel.rb +22 -3
  28. data/lib/bmg/reader/text_file.rb +56 -0
  29. data/lib/bmg/relation.rb +31 -2
  30. data/lib/bmg/relation/empty.rb +4 -0
  31. data/lib/bmg/relation/in_memory.rb +10 -1
  32. data/lib/bmg/relation/materialized.rb +6 -0
  33. data/lib/bmg/relation/spied.rb +6 -1
  34. data/lib/bmg/sequel/relation.rb +5 -0
  35. data/lib/bmg/sql/relation.rb +2 -3
  36. data/lib/bmg/summarizer.rb +29 -1
  37. data/lib/bmg/summarizer/avg.rb +3 -3
  38. data/lib/bmg/summarizer/by_proc.rb +41 -0
  39. data/lib/bmg/summarizer/distinct.rb +36 -0
  40. data/lib/bmg/summarizer/multiple.rb +46 -0
  41. data/lib/bmg/summarizer/percentile.rb +47 -0
  42. data/lib/bmg/support.rb +2 -0
  43. data/lib/bmg/support/ordering.rb +20 -0
  44. data/lib/bmg/support/output_preferences.rb +44 -0
  45. data/lib/bmg/support/tuple_algebra.rb +6 -0
  46. data/lib/bmg/support/tuple_transformer.rb +4 -5
  47. data/lib/bmg/version.rb +2 -2
  48. data/lib/bmg/writer.rb +16 -0
  49. data/lib/bmg/writer/csv.rb +7 -7
  50. data/lib/bmg/writer/xlsx.rb +68 -0
  51. data/tasks/test.rake +9 -2
  52. metadata +37 -15
@@ -37,6 +37,7 @@ module Bmg
37
37
  end
38
38
 
39
39
  def each(&bl)
40
+ return to_enum unless block_given?
40
41
  if all?
41
42
  operands.each do |op|
42
43
  op.each(&bl)
data/lib/bmg/reader.rb CHANGED
@@ -9,3 +9,4 @@ module Bmg
9
9
  end
10
10
  require_relative "reader/csv"
11
11
  require_relative "reader/excel"
12
+ require_relative "reader/text_file"
@@ -5,30 +5,36 @@ module Bmg
5
5
 
6
6
  DEFAULT_OPTIONS = {
7
7
  :headers => true,
8
- :return_headers => false
8
+ :return_headers => false,
9
+ :smart => true
9
10
  }
10
11
 
11
- def initialize(type, path, options = {})
12
+ def initialize(type, path_or_io, options = {})
12
13
  @type = type
13
- @path = path
14
+ @path_or_io = path_or_io
14
15
  @options = DEFAULT_OPTIONS.merge(options)
15
- @options[:col_sep] ||= infer_col_sep
16
- @options[:quote_char] ||= infer_quote_char
16
+ if @options[:smart] && !@path_or_io.is_a?(IO)
17
+ @options[:col_sep] ||= infer_col_sep
18
+ @options[:quote_char] ||= infer_quote_char
19
+ end
17
20
  end
18
21
 
19
22
  def each
23
+ return to_enum unless block_given?
20
24
  require 'csv'
21
- ::CSV.foreach(@path, @options) do |row|
22
- yield tuple(row)
25
+ with_io do |io|
26
+ ::CSV.new(io, csv_options).each do |row|
27
+ yield tuple(row)
28
+ end
23
29
  end
24
30
  end
25
31
 
26
32
  def to_ast
27
- [ :csv, @path, @options ]
33
+ [ :csv, @path_or_io, @options ]
28
34
  end
29
35
 
30
36
  def to_s
31
- "(csv #{path})"
37
+ "(csv #{@path_or_io})"
32
38
  end
33
39
  alias :inspect :to_s
34
40
 
@@ -47,7 +53,16 @@ module Bmg
47
53
  end
48
54
 
49
55
  def text_portion
50
- @text_portion ||= File.foreach(@path).first(10).join("\n")
56
+ @text_portion ||= with_io{|io| io.readlines(10).join("\n") }
57
+ end
58
+
59
# Calls the block with a readable stream for the reader's input and returns
# the block's result.
#
# An IO or StringIO input is passed to the block as-is. Anything else is
# handed to File.open in read mode, which closes the handle once the block
# returns.
def with_io(&bl)
  stream_given = @path_or_io.is_a?(IO) || @path_or_io.is_a?(StringIO)
  return bl.call(@path_or_io) if stream_given

  File.open(@path_or_io, "r", &bl)
end
52
67
 
53
68
  # Finds the best candidate among `candidates` for a separator
@@ -61,6 +76,10 @@ module Bmg
61
76
  snif.size > 0 ? snif[0][0] : default
62
77
  end
63
78
 
79
# Returns (and memoizes) a copy of @options without the :smart key, which
# is this reader's own option. @options itself is left untouched.
def csv_options
  return @csv_options if @csv_options

  forwarded = @options.dup
  forwarded.delete(:smart)
  @csv_options = forwarded
end
82
+
64
83
  end # class Csv
65
84
  end # module Reader
66
85
  end # module Bmg
@@ -4,7 +4,8 @@ module Bmg
4
4
  include Reader
5
5
 
6
6
  DEFAULT_OPTIONS = {
7
- skip: 0
7
+ skip: 0,
8
+ row_num: true
8
9
  }
9
10
 
10
11
  def initialize(type, path, options = {})
@@ -14,6 +15,7 @@ module Bmg
14
15
  end
15
16
 
16
17
  def each
18
+ return to_enum unless block_given?
17
19
  require 'roo'
18
20
  xlsx = Roo::Spreadsheet.open(@path, @options)
19
21
  headers = nil
@@ -25,7 +27,11 @@ module Bmg
25
27
  if i==0
26
28
  headers = row.map(&:to_sym)
27
29
  else
28
- tuple = (0...headers.size).each_with_object({}){|i,t| t[headers[i]] = row[i] }
30
+ init = init_tuple(i)
31
+ tuple = (0...headers.size)
32
+ .each_with_object(init){|i,t|
33
+ t[headers[i]] = row[i]
34
+ }
29
35
  yield(tuple)
30
36
  end
31
37
  end
@@ -36,10 +42,23 @@ module Bmg
36
42
  end
37
43
 
38
44
  def to_s
39
- "(excel #{path})"
45
+ "(excel #{@path})"
40
46
  end
41
47
  alias :inspect :to_s
42
48
 
49
+ private
50
+
51
# Builds the hash each row's tuple starts from, according to the :row_num
# option:
#
# - true          => { :row_num => i }
# - false or nil  => {} (no row number attribute)
# - Symbol/String => { <that name as a symbol> => i }
#
# Raises an ArgumentError for any other value. Returning nil here would only
# surface later as an obscure NoMethodError when the tuple gets filled.
def init_tuple(i)
  case as = @options[:row_num]
  when true
    { :row_num => i }
  when false, nil
    {}
  when Symbol, String
    { as.to_sym => i }
  else
    raise ArgumentError, "Unexpected :row_num option `#{as.inspect}`"
  end
end
61
+
43
62
  end # class Excel
44
63
  end # module Reader
45
64
  end # module Bmg
@@ -0,0 +1,56 @@
1
module Bmg
  module Reader
    #
    # Reader exposing the lines of a text source as tuples.
    #
    # Without a `:parse` regexp, every line yields `{line:, text:}`. With one,
    # only the lines matching the regexp are yielded, as `{line:}` merged with
    # the regexp's named captures (as symbols).
    #
    # `line` is the 1-based position of the line in the source.
    #
    class TextFile
      include Reader

      DEFAULT_OPTIONS = {
        strip: true,
        parse: nil
      }

      # type    - the base relation type; refined by #infer_type when it is
      #           Bmg::Type::ANY.
      # path    - the text source. It only needs to respond to `each_line`.
      #           NOTE(review): presumably a Path-like object; a plain String
      #           would iterate over its own text, not over a file -- confirm
      #           with callers.
      # options - a Hash merged over DEFAULT_OPTIONS, or directly a Regexp as
      #           a shortcut for `{ parse: regexp }`.
      def initialize(type, path, options = {})
        options = { parse: options } if options.is_a?(Regexp)
        @path = path
        @options = DEFAULT_OPTIONS.merge(options)
        @type = infer_type(type)
      end
      attr_reader :path, :options

    public # Relation

      # Yields one tuple per (kept) line; returns an Enumerator when no block
      # is given, like the other readers do.
      def each
        return to_enum unless block_given?

        path.each_line.each_with_index do |text, line|
          text = text.strip if strip?
          parsed = parse(text)
          # parse returns nil for lines not matching the :parse regexp; those
          # are skipped but still count in the line numbering.
          yield({line: 1+line}.merge(parsed)) if parsed
        end
      end

    private

      # Returns `base` untouched unless it is Bmg::Type::ANY, in which case
      # the attribute list ([:line, :text], or :line plus the regexp's named
      # captures) and the [:line] key are added to it.
      def infer_type(base)
        return base unless base == Bmg::Type::ANY
        attr_list = if rx = options[:parse]
          [:line] + rx.names.map(&:to_sym)
        else
          [:line, :text]
        end
        base
          .with_attrlist(attr_list)
          .with_keys([[:line]])
      end

      # Whether lines are stripped before being parsed.
      def strip?
        options[:strip]
      end

      # Converts a line to the attributes it contributes to the tuple:
      # `{text: text}` without a :parse regexp, the symbolized named captures
      # when the regexp matches, nil otherwise.
      def parse(text)
        return { text: text } unless rx = options[:parse]
        if match = rx.match(text)
          TupleAlgebra.symbolize_keys(match.named_captures)
        end
      end

    end # class TextFile
  end # module Reader
end # module Bmg
data/lib/bmg/relation.rb CHANGED
@@ -17,6 +17,22 @@ module Bmg
17
17
  self
18
18
  end
19
19
 
20
+ def type
21
+ Bmg::Type::ANY
22
+ end
23
+
24
+ def with_type(type)
25
+ dup.tap{|r|
26
+ r.type = type
27
+ }
28
+ end
29
+
30
# Returns a relation whose type knows its attribute list.
#
# Returns self when the type already knows it. Otherwise the attribute names
# are taken from the keys of the first tuple and installed through
# `with_type`. An empty relation has no tuple to read names from, so it is
# returned unchanged instead of failing on nil.
def with_type_attrlist
  return self if type.knows_attrlist?

  sample = first
  return self if sample.nil?

  with_type(type.with_attrlist(sample.keys))
end
35
+
20
36
  def with_typecheck
21
37
  dup.tap{|r|
22
38
  r.type = r.type.with_typecheck
@@ -100,6 +116,18 @@ module Bmg
100
116
  end
101
117
  end
102
118
 
119
# Returns the number of tuples, as computed by `_count`.
#
# When the type knows its keys, the relation is first projected on the first
# of them and `_count` is called on that projection; otherwise `_count` is
# called on the relation itself.
def count
  return _count unless type.knows_keys?

  project(type.keys.first)._count
end
125
+ end
126
+
127
# Default counting strategy: converts the relation with `to_a` and returns
# the size of the result.
def _count
  tuples = to_a
  tuples.size
end
130
+
103
131
  # Returns a json representation
104
132
  def to_json(*args, &bl)
105
133
  to_a.to_json(*args, &bl)
@@ -113,9 +141,10 @@ module Bmg
113
141
  # When no string_or_io is used, the method uses a string.
114
142
  #
115
143
  # The method always returns the string_or_io.
116
- def to_csv(options = {}, string_or_io = nil)
144
+ def to_csv(options = {}, string_or_io = nil, preferences = nil)
117
145
  options, string_or_io = {}, options unless options.is_a?(Hash)
118
- Writer::Csv.new(options).call(self, string_or_io)
146
+ string_or_io, preferences = nil, string_or_io if string_or_io.is_a?(Hash)
147
+ Writer::Csv.new(options, preferences).call(self, string_or_io)
119
148
  end
120
149
 
121
150
  # Converts to an sexpr expression.
@@ -19,6 +19,10 @@ module Bmg
19
19
  def each(&bl)
20
20
  end
21
21
 
22
+ def _count
23
+ 0
24
+ end
25
+
22
26
  def to_ast
23
27
  [ :empty ]
24
28
  end
@@ -8,7 +8,6 @@ module Bmg
8
8
  @type = type
9
9
  end
10
10
  attr_accessor :type
11
- protected :type=
12
11
  attr_reader :operand
13
12
 
14
13
  public
@@ -17,6 +16,16 @@ module Bmg
17
16
  @operand.each(&bl)
18
17
  end
19
18
 
19
# Counts through the operand when it can answer by itself: `count` is
# preferred, then `size`. Falls back to the inherited implementation
# otherwise.
def _count
  return operand.count if operand.respond_to?(:count)
  return operand.size if operand.respond_to?(:size)

  super
end
28
+
20
29
  def to_ast
21
30
  [ :in_memory, operand ]
22
31
  end
@@ -16,6 +16,12 @@ module Bmg
16
16
  end
17
17
  protected :type=
18
18
 
19
+ public
20
+
21
+ def _count
22
+ operand._count
23
+ end
24
+
19
25
  public
20
26
 
21
27
  def each(&bl)
@@ -24,10 +24,15 @@ module Bmg
24
24
  protected :type=
25
25
 
26
26
  def each(&bl)
27
- spy.call(self)
27
+ spy.call(self) if bl
28
28
  operand.each(&bl)
29
29
  end
30
30
 
31
# Notifies the spy, then delegates the counting to the operand.
#
# Unlike #each there is no block to test for here (a `bl` reference in this
# method would raise NameError), so the spy is always called.
def count
  spy.call(self)
  operand.count
end
35
+
31
36
  def to_ast
32
37
  [ :spied, operand.to_ast, spy ]
33
38
  end
@@ -9,6 +9,7 @@ module Bmg
9
9
  attr_reader :sequel_db
10
10
 
11
11
  def each(&bl)
12
+ return to_enum unless block_given?
12
13
  dataset.each(&bl)
13
14
  end
14
15
 
@@ -33,6 +34,10 @@ module Bmg
33
34
  base_table.update(arg)
34
35
  end
35
36
 
37
+ def _count
38
+ dataset.count
39
+ end
40
+
36
41
  def to_ast
37
42
  [:sequel, dataset.sql]
38
43
  end
@@ -10,7 +10,6 @@ module Bmg
10
10
  end
11
11
 
12
12
  attr_accessor :type
13
- protected :type=
14
13
 
15
14
  protected
16
15
 
@@ -134,8 +133,8 @@ module Bmg
134
133
  _instance(type, builder, expr)
135
134
  end
136
135
 
137
- def _summarize(type, by, summarization)
138
- summarization = Operator::Summarize.compile(summarization)
136
+ def _summarize(type, by, defs)
137
+ summarization = ::Bmg::Summarizer.summarization(defs)
139
138
  if can_compile_summarization?(summarization)
140
139
  expr = before_use(self.expr)
141
140
  expr = Processor::Summarize.new(by, summarization, builder).call(self.expr)
@@ -50,6 +50,21 @@ module Bmg
50
50
  end
51
51
  end
52
52
 
53
# Compiles summarization definitions to a Hash of summarizers, keeping the
# keys of `defs`.
#
# Each value is converted according to its class:
# - a Summarizer is kept as is,
# - a Symbol is sent to Summarizer with the key as argument,
# - a Proc is passed as a block to Summarizer.by_proc.
#
# Raises an ArgumentError for any other kind of value.
def self.summarization(defs)
  defs.each_with_object({}) do |(attr, definition), compiled|
    compiled[attr] =
      case definition
      when Summarizer then definition
      when Symbol     then Summarizer.send(definition, attr)
      when Proc       then Summarizer.by_proc(&definition)
      else
        raise ArgumentError, "Unexpected summarizer #{attr} => #{definition}"
      end
  end
end
67
+
53
68
  # Returns the default options to use
54
69
  #
55
70
  # @return the default aggregation options
@@ -80,7 +95,16 @@ module Bmg
80
95
  # @param the current iterated tuple
81
96
  # @return updated memo value
82
97
  def happens(memo, tuple)
83
- value = @functor.is_a?(Proc) ? @functor.call(tuple) : tuple[@functor]
98
+ value = case @functor
99
+ when Proc
100
+ @functor.call(tuple)
101
+ when NilClass
102
+ tuple
103
+ when Symbol
104
+ tuple[@functor]
105
+ else
106
+ tuple[@functor]
107
+ end
84
108
  _happens(memo, value)
85
109
  end
86
110
 
@@ -128,5 +152,9 @@ require_relative 'summarizer/max'
128
152
  require_relative 'summarizer/avg'
129
153
  require_relative 'summarizer/variance'
130
154
  require_relative 'summarizer/stddev'
155
+ require_relative 'summarizer/percentile'
131
156
  require_relative 'summarizer/collect'
157
+ require_relative 'summarizer/distinct'
132
158
  require_relative 'summarizer/concat'
159
+ require_relative 'summarizer/by_proc'
160
+ require_relative 'summarizer/multiple'
@@ -16,13 +16,13 @@ module Bmg
16
16
  end
17
17
 
18
18
  # Collects one more value + the sum of all
19
- def _happens(memo, val)
19
+ def _happens(memo, val)
20
20
  [memo.first + val, memo.last + 1]
21
21
  end
22
22
 
23
23
  # Finalizes the computation.
24
- def finalize(memo)
25
- memo.first / memo.last
24
+ def finalize(memo)
25
+ memo.first / memo.last
26
26
  end
27
27
 
28
28
  end # class Avg
@@ -0,0 +1,41 @@
1
module Bmg
  class Summarizer
    #
    # Generic summarizer whose accumulation step is a user-supplied Proc.
    #
    # The proc is called with `(value, memo)` and its return value becomes
    # the new memo (fold semantics: the result of the call is what counts).
    #
    # Example:
    #
    #   # direct ruby usage
    #   Bmg::Summarizer.by_proc{|t,memo| ... }.summarize(...)
    #
    class ByProc < Summarizer

      # least   - the initial memo value
      # by_proc - the callable invoked as `by_proc.call(value, memo)`
      def initialize(least, by_proc)
        @least = least
        @by_proc = by_proc
      end

      # Returns the initial memo value given at construction.
      #
      # NOTE(review): the very same object is returned on every call; if the
      # proc mutates its memo in place, a mutable `least` may end up shared --
      # confirm how callers use it.
      def least()
        @least
      end

      # Computes the next memo by calling the proc with `(val, memo)`.
      #
      # This overrides `happens` itself, so `val` is handed to the proc
      # exactly as received.
      def happens(memo, val)
        @by_proc.call(val, memo)
      end

      # The final memo is the summary; no post-processing is applied.
      def finalize(memo)
        memo
      end

    end # class ByProc

    # Factors a ByProc summarizer.
    #
    # Supported call forms:
    #
    #   by_proc{|t,memo| ... }       # block, least is nil
    #   by_proc(least){|t,memo| ...} # block with an initial memo
    #   by_proc(a_proc)              # proc only, least is nil
    #   by_proc(least, a_proc)       # initial memo and proc
    #
    # An explicit proc takes precedence over a block. Raises an ArgumentError
    # when neither is given.
    def self.by_proc(least = nil, proc = nil, &bl)
      least, proc = nil, least if least.is_a?(Proc)
      callable = proc || bl
      raise ArgumentError, "Proc or block expected" if callable.nil?
      ByProc.new(least, callable)
    end

  end # class Summarizer
end # module Bmg