hivemeta 0.0.6 → 0.1.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
data/CHANGELOG CHANGED
@@ -1,3 +1,7 @@
1
+ * 2011-05-21 - fsf
2
+ - new: added table.process* opts hash
3
+ - new: added :ignore_field_count, :field_count_warning, and :file opts
4
+
1
5
  * 2011-05-19 - fsf
2
6
  - thank you ruby-prof!
3
7
  - perf: 4x+ faster ... now basically on par with manual split into array
@@ -3,17 +3,13 @@ module HiveMeta
3
3
  class FieldCountError < StandardError ; end
4
4
 
5
5
  class Record
6
- def initialize(line, table)
6
+ def initialize(line, table, opts = {})
7
7
  @fields = line.chomp.split(table.delimiter, -1)
8
8
  if @fields.size != table.columns.size
9
- raise FieldCountError
9
+ raise FieldCountError if not opts[:ignore_field_count]
10
10
  end
11
11
 
12
12
  @table = table
13
- #@columns = {}
14
- # table.each_col_with_index do |col_name, i|
15
- # #@columns[col_name.to_sym] = @fields[i]
16
- # end
17
13
  end
18
14
 
19
15
  # allow for column access via column name as an index
@@ -23,7 +19,6 @@ module HiveMeta
23
19
  # example: rec[7]
24
20
  def [] index
25
21
  return "#{@fields[index]}" if index.is_a? Integer
26
- #"#{@columns[index.to_sym]}"
27
22
  "#{@fields[@table.indexes[index.to_sym]]}"
28
23
  end
29
24
 
@@ -31,7 +26,6 @@ module HiveMeta
31
26
  # example: rec.col_name
32
27
  def method_missing(id, *args)
33
28
  return @fields[@table.indexes[id]] if @fields[@table.indexes[id]]
34
- #return @columns[id] if @columns[id]
35
29
  raise NoMethodError
36
30
  end
37
31
  end
@@ -41,27 +41,30 @@ module HiveMeta
41
41
 
42
42
  # process a row and return a record that can be queried
43
43
  # by column name in a variety of ways
44
- def process_row(line)
44
+ def process_row(line, opts = {})
45
45
  return nil if not line
46
46
  if block_given?
47
- yield Record.new(line, self)
47
+ yield Record.new(line, self, opts)
48
48
  else
49
- return Record.new(line, self)
49
+ return Record.new(line, self, opts)
50
50
  end
51
51
  end
52
52
 
53
53
  # process all input (default to STDIN for Hadoop Streaming)
54
54
  # via a provided block
55
- def process(f = STDIN, warning = nil)
55
+ def process(opts = {})
56
+ f = opts[:file] || STDIN
57
+
56
58
  if not block_given?
57
- return process_row f.readline
59
+ return process_row(f.readline, opts)
58
60
  end
59
61
 
60
62
  f.each_line do |line|
61
63
  begin
62
- process_row(line) {|row| yield row}
64
+ process_row(line, opts) {|row| yield row}
63
65
  rescue HiveMeta::FieldCountError
64
- warning ||= "reporter:counter:bad_data,row_size,1"
66
+ warning = opts[:field_count_warning]
67
+ warning ||= "reporter:counter:HiveMeta,FieldCountError,1"
65
68
  STDERR.puts warning
66
69
  next
67
70
  end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
+ - 1
7
8
  - 0
8
- - 6
9
- version: 0.0.6
9
+ version: 0.1.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Frank Fejes
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-05-19 00:00:00 -05:00
17
+ date: 2011-05-21 00:00:00 -05:00
18
18
  default_executable:
19
19
  dependencies: []
20
20