libhxl-ruby 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 719cfb05f414be3c3f796a5549d30d68ed4e9a03
4
+ data.tar.gz: 75f03298ddd344a4cca609a3e3db1897da1ad35e
5
+ SHA512:
6
+ metadata.gz: 29f19279e9d8f3d60b2d1d3e536193db92d70ab8936b7b866d9cfd62853548830031dfbb37e95fdd333e5c18c4402afc5097b123e629002b13a4be4e4358820e
7
+ data.tar.gz: d9db4b7a208fa9bf3c26d650ea9fd250d6f1b4197682648f427069cd8f13921a9dbf9a238b762f776cd33d68fc2a5862110789e3c099e92e21f8b17e88a3c17e
@@ -0,0 +1,193 @@
1
# Streaming reader for HXL-tagged CSV files.
#
# The reader scans the CSV for the first row made up entirely of HXL hashtags
# (blank cells allowed), then turns every following CSV row into one or more
# logical HXLRow objects. A source row expands into several logical rows when
# the hashtag row contains "compact disaggregated" columns of the form
# "#fixed_tag + #value_tag".
class HXLReader

  # Pattern for a single tag with an optional two-letter language suffix,
  # e.g. "#sector" or "#sector/en".
  TAG_PATTERN = /(#[\w]+)(?:\/([[:alpha:]]{2}))?/

  # Pattern for a full tag spec: one tag, optionally followed by '+' and a
  # second tag. Anchored with \A and \z so that a multi-line cell cannot
  # match on just one of its lines.
  FULL_TAG_PATTERN = /\A\s*#{TAG_PATTERN}(?:\s*\+\s*#{TAG_PATTERN})?\s*\z/

  private_constant :TAG_PATTERN, :FULL_TAG_PATTERN

  def initialize
  end

  # Iterate over the logical HXL rows of the CSV file at +path+.
  #
  # @param path [String] path of the CSV file to read
  # @yield [HXLRow] each logical row found after the hashtag row
  # @return [Enumerator, nil] an Enumerator when no block is given
  # @raise [HXLFormatError] if the file contains no HXL hashtag row
  def self.foreach(path, &block)
    return to_enum(:foreach, path) unless block_given?

    table_spec = nil
    prev_row = nil
    source_row_number = -1
    row_number = -1

    CSV.foreach(path) do |row|
      source_row_number += 1

      # Until a hashtag row has been found, every row is only a candidate
      # header. The row before the hashtag row is remembered because it
      # supplies the fixed values of disaggregated columns.
      if table_spec.nil?
        table_spec = parse_hashtag_row(row, prev_row)
        prev_row = row
        next
      end

      position = 0
      total = table_spec.get_disaggregation_count

      loop do
        # Next logical row
        row_number += 1

        hxl_row, next_position = parse_row(row,
                                           table_spec,
                                           position,
                                           row_number,
                                           source_row_number)
        yield hxl_row

        # Stop when every disaggregation has been emitted, or when the row
        # made no progress (it is too short to reach a disaggregated
        # column); without the progress check such a row would loop forever.
        break unless next_position > position && next_position < total

        position = next_position
      end
    end

    raise HXLFormatError, 'HXL hashtag row not found' if table_spec.nil?
  end

  # Build one logical HXLRow from a raw CSV row.
  #
  # @param row [Array] raw CSV cells of the source row
  # @param table_spec [HXLTableSpec] column layout taken from the hashtag row
  # @param disaggregation_position [Integer] which disaggregated column to
  #   emit for this logical row (0-based)
  # @param row_number [Integer] logical row number
  # @param source_row_number [Integer] row number in the source CSV
  # @return [Array(HXLRow, Integer)] the row and the next disaggregation position
  def self.parse_row(row, table_spec, disaggregation_position, row_number, source_row_number)
    hxl_fields = []
    seen_fixed = false

    row.each_with_index do |value, source_col_number|
      col_spec = table_spec.col_specs[source_col_number]

      # Only parse HXL columns; cells beyond the width of the hashtag row
      # have no column spec at all and are ignored as well.
      next if col_spec.nil? || col_spec.column.hxl_tag.nil?

      if col_spec.fixed_column
        # Disaggregated columns: only the first one encountered emits
        # values, and it emits the pair selected by disaggregation_position.
        next if seen_fixed

        raw_position = table_spec.get_raw_position(disaggregation_position)
        raw_spec = table_spec.col_specs[raw_position]

        hxl_fields.push HXLValue.new(raw_spec.fixed_column,
                                     raw_spec.fixed_value,
                                     hxl_fields.length,
                                     source_col_number)
        hxl_fields.push HXLValue.new(raw_spec.column,
                                     row[raw_position],
                                     hxl_fields.length,
                                     source_col_number)

        seen_fixed = true
        disaggregation_position += 1
      else
        # Regular column
        hxl_fields.push HXLValue.new(col_spec.column,
                                     value,
                                     hxl_fields.length,
                                     source_col_number)
      end
    end

    hxl_row = HXLRow.new(table_spec.hxl_headers,
                         hxl_fields,
                         false,
                         row_number,
                         source_row_number)

    [hxl_row, disaggregation_position]
  end

  # Try parsing a raw CSV row as a HXL hashtag row.
  #
  # @param row [Array] raw CSV cells of the candidate row
  # @param prev_row [Array, nil] the preceding raw row, or nil when the
  #   candidate is the first row of the file
  # @return [HXLTableSpec, nil] the table spec on success; nil when any
  #   non-blank cell is not a hashtag or when every cell is blank
  def self.parse_hashtag_row(row, prev_row)
    seen_header = false
    table_spec = HXLTableSpec.new

    row.each_with_index do |value, col_num|
      value = value.strip if value

      if value.nil? || value.empty?
        # Blank cell: keep a placeholder so column positions stay aligned.
        table_spec.push HXLColSpec.new(col_num, HXLColumn.new)
        next
      end

      col_spec = parse_hashtag(col_num, value)
      return nil if col_spec.nil?

      seen_header = true

      # The fixed value comes from the row above the hashtag row; there is
      # none when the hashtag row is the very first row.
      col_spec.fixed_value = prev_row && prev_row[col_num] if col_spec.fixed_column

      table_spec.push col_spec
    end

    seen_header ? table_spec : nil
  end

  # Parse a single cell as a hashtag spec.
  #
  # @param source_col_number [Integer] column index of the cell in the CSV
  # @param value [String] cell text
  # @return [HXLColSpec, nil] nil when the text is not a valid tag spec
  def self.parse_hashtag(source_col_number, value)
    result = FULL_TAG_PATTERN.match(value)
    return nil unless result

    first = HXLColumn.new(result[1], result[2])
    return HXLColSpec.new(source_col_number, first) unless result[3]

    # Two tags: the first is the fixed column, the second is the value column.
    second = HXLColumn.new(result[3], result[4])
    HXLColSpec.new(source_col_number, second, first)
  end

  # Search for the HXL hashtag row.
  # NOTE(review): unfinished stub - it only delegates to parse_source_row,
  # which is empty, so it currently always returns nil.
  def self.parse_table_spec(row, prev_row)
    parse_source_row
  end

  # NOTE(review): empty placeholder; returns nil.
  def self.parse_source_row
  end

end
187
+
188
+ require 'libhxl-ruby/hxl_format_error'
189
+ require 'libhxl-ruby/hxl_table_spec'
190
+ require 'libhxl-ruby/hxl_row'
191
+ require 'libhxl-ruby/hxl_value'
192
+ require 'libhxl-ruby/hxl_column'
193
+ require 'libhxl-ruby/hxl_col_spec'
@@ -0,0 +1,19 @@
1
# Column metadata for parsing a HXL CSV file
#
# This class captures the way a column is encoded in the input CSV
# file, which might be different from the logical structure of the
# HXL data. Used only during parsing.
class HXLReader::HXLColSpec

  # Position of the column in the source CSV row.
  attr_reader :source_col_number

  # Column object describing this column's tag.
  attr_reader :column

  # Optional second column object paired with a fixed value; nil unless given.
  attr_reader :fixed_column

  # Value paired with fixed_column; writable so it can be set after construction.
  attr_accessor :fixed_value

  def initialize(source_col_number, column = nil, fixed_column = nil, fixed_value = nil)
    @source_col_number, @column = source_col_number, column
    @fixed_column, @fixed_value = fixed_column, fixed_value
  end

end
@@ -0,0 +1,20 @@
1
# A logical HXL column: a hashtag, an optional language code, and a
# human-readable header text derived from the tag.
class HXLReader::HXLColumn

  attr_reader :hxl_tag, :language_code, :header_text

  # Both arguments are optional; a column created without a tag gets an
  # empty header text.
  def initialize(hxl_tag = nil, language_code = nil)
    @hxl_tag = hxl_tag
    @language_code = language_code
    @header_text = pretty_tag
  end

  # Header text for the tag: the leading '#' and a trailing type suffix
  # (_date, _deg, _id, _link, _num) are removed, underscores become spaces,
  # and the result is upper-cased. Returns '' when there is no tag.
  def pretty_tag
    if @hxl_tag
      without_hash = @hxl_tag.gsub(/^#/, '')
      without_suffix = without_hash.gsub(/_(date|deg|id|link|num)$/, '')
      without_suffix.gsub('_', ' ').upcase
    else
      ''
    end
  end
end
@@ -0,0 +1,2 @@
1
# Error raised for malformed HXL input, such as a file in which no HXL
# hashtag row can be found.
class HXLReader::HXLFormatError < StandardError; end
@@ -0,0 +1,27 @@
1
+ require 'csv'
2
+
3
# Declared in nested form so that this file only needs 'csv' to load; it
# reopens HXLReader when that class already exists.
class HXLReader
  # An iterable row of HXL value objects.
  #
  # Behaves like a CSV::Row (headers paired with fields) and additionally
  # records where the row sits in the logical HXL data and in the source CSV.
  class HXLRow < CSV::Row

    attr_reader :row_number, :source_row_number

    # @param headers [Array] header for each field, passed to CSV::Row
    # @param fields [Array] field objects, passed to CSV::Row
    # @param header_row [Boolean] passed to CSV::Row
    # @param row_number [Integer, nil] logical row number
    # @param source_row_number [Integer, nil] row number in the source CSV
    def initialize(headers, fields, header_row = false, row_number = nil, source_row_number = nil)
      super(headers, fields, header_row)
      @row_number = row_number
      @source_row_number = source_row_number
    end

    # Multi-line, human-readable dump of the row: both row numbers, then one
    # line per [header, field] pair.
    #
    # @return [String]
    def to_s
      lines = ['<HXLRow',
               " rowNumber: #{@row_number}",
               " sourceRowNumber: #{@source_row_number}"]
      # CSV::Row is Enumerable over its [header, field] pairs, so there is
      # no need to reach into the parent's internal state.
      lines.concat(map { |pair| " #{pair}" })
      lines.push('>')
      lines.join("\n")
    end

  end
end
@@ -0,0 +1,45 @@
1
# Table metadata for parsing a HXL dataset
#
# Holds the ordered list of column specs, one per source column.
class HXLReader::HXLTableSpec

  attr_reader :col_specs

  def initialize
    @col_specs = []
  end

  # Append a column spec; returns the list of column specs.
  def push(col_spec)
    @col_specs << col_spec
  end

  # Tags that label the logical columns, in order. Only the first spec with
  # a fixed column contributes (its fixed column's tag, then its own tag);
  # later specs with a fixed column are skipped. Specs without a fixed
  # column contribute their own tag when it is not nil.
  def hxl_headers
    fixed_emitted = false

    @col_specs.each_with_object([]) do |spec, tags|
      if spec.fixed_column
        next if fixed_emitted

        tags << spec.fixed_column.hxl_tag
        fixed_emitted = true
      end

      tag = spec.column.hxl_tag
      tags << tag unless tag.nil?
    end
  end

  # Number of column specs that carry a fixed column.
  def get_disaggregation_count
    @col_specs.count { |spec| spec.fixed_column }
  end

  # Index, within col_specs, of the spec at which the countdown started at
  # disaggregation_position drops below zero (the countdown decreases at
  # every spec with a fixed column). Returns -1 when that never happens.
  def get_raw_position(disaggregation_position)
    remaining = disaggregation_position

    found = @col_specs.index do |spec|
      remaining -= 1 if spec.fixed_column
      remaining < 0
    end

    found || -1
  end

end
@@ -0,0 +1,12 @@
1
# A single HXL value at the intersection of a row and column
class HXLReader::HXLValue

  # The column object this value was created with.
  attr_reader :column

  # The cell content.
  attr_reader :value

  # Column number and source column number, stored exactly as given.
  attr_reader :col_num, :source_col_num

  def initialize(column, value, col_num, source_col_num)
    @column, @value = column, value
    @col_num, @source_col_num = col_num, source_col_num
  end

end
metadata ADDED
@@ -0,0 +1,65 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: libhxl-ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Ben Rudolph
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-10-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '3.1'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '3.1'
27
+ description: A simple gem to parse your HXL files
28
+ email: rudolphben@gmail.com
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - lib/libhxl-ruby.rb
34
+ - lib/libhxl-ruby/hxl_col_spec.rb
35
+ - lib/libhxl-ruby/hxl_column.rb
36
+ - lib/libhxl-ruby/hxl_format_error.rb
37
+ - lib/libhxl-ruby/hxl_row.rb
38
+ - lib/libhxl-ruby/hxl_table_spec.rb
39
+ - lib/libhxl-ruby/hxl_value.rb
40
+ homepage: https://github.com/benrudolph/libhxl-ruby
41
+ licenses:
42
+ - MIT
43
+ metadata: {}
44
+ post_install_message:
45
+ rdoc_options: []
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - '>='
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - '>='
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ requirements: []
59
+ rubyforge_project:
60
+ rubygems_version: 2.4.2
61
+ signing_key:
62
+ specification_version: 4
63
+ summary: A HXL parser for ruby
64
+ test_files: []
65
+ has_rdoc: