libhxl-ruby 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/libhxl-ruby.rb +193 -0
- data/lib/libhxl-ruby/hxl_col_spec.rb +19 -0
- data/lib/libhxl-ruby/hxl_column.rb +20 -0
- data/lib/libhxl-ruby/hxl_format_error.rb +2 -0
- data/lib/libhxl-ruby/hxl_row.rb +27 -0
- data/lib/libhxl-ruby/hxl_table_spec.rb +45 -0
- data/lib/libhxl-ruby/hxl_value.rb +12 -0
- metadata +65 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 719cfb05f414be3c3f796a5549d30d68ed4e9a03
|
4
|
+
data.tar.gz: 75f03298ddd344a4cca609a3e3db1897da1ad35e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 29f19279e9d8f3d60b2d1d3e536193db92d70ab8936b7b866d9cfd62853548830031dfbb37e95fdd333e5c18c4402afc5097b123e629002b13a4be4e4358820e
|
7
|
+
data.tar.gz: d9db4b7a208fa9bf3c26d650ea9fd250d6f1b4197682648f427069cd8f13921a9dbf9a238b762f776cd33d68fc2a5862110789e3c099e92e21f8b17e88a3c17e
|
data/lib/libhxl-ruby.rb
ADDED
@@ -0,0 +1,193 @@
|
|
1
|
+
# Reader for CSV files that carry a row of HXL hashtags.
#
# All functionality is exposed through class methods: HXLReader.foreach is
# the entry point and the parse_* methods are the helpers it relies on.
class HXLReader
  # Pattern for a single tag with an optional two-letter language suffix,
  # e.g. "#sector" or "#sector/en".
  TAG_REGEX = /(#[\w]+)(?:\/([[:alpha:]]{2}))?/

  # Pattern for a full tag spec (optional second tag following '+').
  FULL_TAG_REGEX = /^\s*#{TAG_REGEX}(?:\s*\+\s*#{TAG_REGEX})?$/

  def initialize
  end

  # Reads the CSV file at +path+ and yields one HXLRow per logical HXL row.
  #
  # Rows before the hashtag row are only used to locate the hashtag row (and
  # to supply fixed values for disaggregated columns). Every source row after
  # it is yielded once, or once per disaggregated column when the hashtag row
  # declares any ("#fixed + #value" specs).
  #
  # @param path [String] path handed to CSV.foreach
  # @yield [HXLRow] each logical row
  # @return [Enumerator, nil] an Enumerator when no block is given
  # @raise [HXLFormatError] if no hashtag row was found in the file
  def self.foreach(path, &block)
    return to_enum(:foreach, path) unless block_given?

    table_spec = nil
    prev_row = nil
    source_row_number = -1
    row_number = -1

    CSV.foreach(path) do |row|
      source_row_number += 1

      # If we don't have a table_spec yet (row of HXL tags), scan for one.
      # Either way this source row carries no data, so move on to the next.
      if table_spec.nil?
        table_spec = parse_hashtag_row(row, prev_row)
        prev_row = row
        next
      end

      disaggregation_count = table_spec.get_disaggregation_count
      disaggregation_position = 0

      loop do
        # Next logical row
        row_number += 1

        hxl_row, next_position = parse_row(row,
                                           table_spec,
                                           disaggregation_position,
                                           row_number,
                                           source_row_number)
        yield hxl_row

        # A blank or short source row never reaches a disaggregated column, so
        # the position does not advance; stop instead of yielding it forever.
        break unless next_position > disaggregation_position

        disaggregation_position = next_position
        break unless disaggregation_position < disaggregation_count
      end
    end

    raise HXLFormatError, 'HXL hashtag row not found' if table_spec.nil?
  end

  # Builds one logical HXLRow from a raw CSV row.
  #
  # Regular HXL columns are copied through. The first disaggregated column
  # encountered is expanded into two values (the fixed column/value and the
  # data value) taken from the column selected by +disaggregation_position+;
  # any further disaggregated columns in the row are skipped.
  #
  # @param row [Array] raw cell values of the source row
  # @param table_spec [HXLTableSpec] column specs from the hashtag row
  # @param disaggregation_position [Integer] which disaggregated column to emit
  # @param row_number [Integer] logical row number stored on the HXLRow
  # @param source_row_number [Integer] source row number stored on the HXLRow
  # @return [Array] the HXLRow and the (possibly advanced) disaggregation position
  def self.parse_row(row, table_spec, disaggregation_position, row_number, source_row_number)
    hxl_fields = []
    col_num = -1
    seen_fixed = false

    row.each_with_index do |value, source_col_number|
      col_spec = table_spec.col_specs[source_col_number]

      # Only parse HXL columns. Cells beyond the hashtag row have no spec at all.
      next if col_spec.nil? || col_spec.column.hxl_tag.nil?

      if col_spec.fixed_column
        # Looking at disaggregation: only the first such column emits values.
        next if seen_fixed

        seen_fixed = true
        raw_position = table_spec.get_raw_position(disaggregation_position)
        raw_spec = table_spec.col_specs[raw_position]

        col_num += 1
        hxl_fields.push HXLValue.new(raw_spec.fixed_column,
                                     raw_spec.fixed_value,
                                     col_num,
                                     source_col_number)

        col_num += 1
        hxl_fields.push HXLValue.new(raw_spec.column,
                                     row[raw_position],
                                     col_num,
                                     source_col_number)

        disaggregation_position += 1
      else
        # Regular column
        col_num += 1
        hxl_fields.push HXLValue.new(col_spec.column,
                                     value,
                                     col_num,
                                     source_col_number)
      end
    end

    hxl_row = HXLRow.new(table_spec.hxl_headers,
                         hxl_fields,
                         false,
                         row_number,
                         source_row_number)

    [hxl_row, disaggregation_position]
  end

  # Tries to parse the current raw CSV row as a HXL hashtag row.
  #
  # Empty cells become untagged columns; any non-empty cell that is not a
  # valid tag spec disqualifies the whole row.
  #
  # @param row [Array] raw cell values of the candidate row
  # @param prev_row [Array, nil] the preceding source row, used as the source
  #   of fixed values for disaggregated columns; nil when there is none
  # @return [HXLTableSpec, nil] the table spec on success, nil on failure
  def self.parse_hashtag_row(row, prev_row)
    seen_header = false
    table_spec = HXLTableSpec.new

    row.each_with_index do |value, col_num|
      value = value.strip if value

      if value.nil? || value.empty?
        table_spec.push HXLColSpec.new(col_num, HXLColumn.new)
        next
      end

      col_spec = parse_hashtag(col_num, value)
      return nil if col_spec.nil?

      seen_header = true

      # With no preceding row (hashtags on the first line) there is no fixed
      # value to pick up, so it stays nil.
      col_spec.fixed_value = prev_row[col_num] if col_spec.fixed_column && prev_row

      table_spec.push col_spec
    end

    seen_header ? table_spec : nil
  end

  # Parses a single hashtag cell into a column spec.
  #
  # For a two-tag spec ("#a + #b") the second tag becomes the column and the
  # first tag becomes the fixed column.
  #
  # @param source_col_number [Integer] index of the cell in the source row
  # @param value [String] cell text
  # @return [HXLColSpec, nil] nil when +value+ is not a valid tag spec
  def self.parse_hashtag(source_col_number, value)
    result = FULL_TAG_REGEX.match value
    return nil unless result

    first = HXLColumn.new result[1], result[2]
    return HXLColSpec.new(source_col_number, first) unless result[3]

    # There were two tags
    second = HXLColumn.new result[3], result[4]
    HXLColSpec.new source_col_number, second, first
  end

  # Search for the HXL hashtag row.
  #
  # NOTE(review): unfinished placeholder. It ignores both arguments and only
  # delegates to parse_source_row, which is empty, so it returns nil.
  def self.parse_table_spec(row, prev_row)
    parse_source_row
  end

  # Empty placeholder; returns nil.
  def self.parse_source_row
  end

end
|
187
|
+
|
188
|
+
require 'libhxl-ruby/hxl_format_error'
|
189
|
+
require 'libhxl-ruby/hxl_table_spec'
|
190
|
+
require 'libhxl-ruby/hxl_row'
|
191
|
+
require 'libhxl-ruby/hxl_value'
|
192
|
+
require 'libhxl-ruby/hxl_column'
|
193
|
+
require 'libhxl-ruby/hxl_col_spec'
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Column metadata for parsing a HXL CSV file
#
# This class captures the way a column is encoded in the input CSV
# file, which might be different from the logical structure of the
# HXL data. Used only during parsing.
class HXLReader::HXLColSpec

  # Index of the column in the source CSV row.
  attr_reader :source_col_number

  # Column object for the values found in this source column.
  attr_reader :column

  # Column object for the fixed (disaggregation) tag, or nil when there is none.
  attr_reader :fixed_column

  # Value that goes with fixed_column; writable so it can be set after construction.
  attr_accessor :fixed_value

  # @param source_col_number index of the column in the source CSV row
  # @param column column object for this source column (default nil)
  # @param fixed_column column object for the fixed tag (default nil)
  # @param fixed_value value for the fixed column (default nil)
  def initialize(source_col_number, column = nil, fixed_column = nil, fixed_value = nil)
    @source_col_number = source_col_number
    @column            = column
    @fixed_column      = fixed_column
    @fixed_value       = fixed_value
  end

end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# A logical column: a HXL hashtag, an optional language code, and a
# header text derived from the tag.
class HXLReader::HXLColumn

  attr_reader :hxl_tag, :language_code, :header_text

  # @param hxl_tag [String, nil] the hashtag, e.g. "#affected_num"
  # @param language_code [String, nil] language code attached to the tag
  def initialize(hxl_tag = nil, language_code = nil)
    @hxl_tag = hxl_tag
    @language_code = language_code
    # Computed once at construction from the tag.
    @header_text = pretty_tag
  end

  # Turns the hashtag into header text: drops the leading '#', drops a
  # trailing _date/_deg/_id/_link/_num suffix, replaces underscores with
  # spaces and upcases the result. Returns '' when there is no tag.
  def pretty_tag
    return '' unless @hxl_tag

    without_hash = @hxl_tag.gsub(/^#/, '')
    without_suffix = without_hash.gsub(/_(date|deg|id|link|num)$/, '')
    without_suffix.tr('_', ' ').upcase
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
# An iterable row of HXL value objects.
#
# A CSV::Row that additionally records the logical row number and the
# number of the source row it was built from.
class HXLReader::HXLRow < CSV::Row

  attr_reader :row_number, :source_row_number

  # @param headers [Array] header values, passed to CSV::Row
  # @param fields [Array] field values, passed to CSV::Row
  # @param header_row [Boolean] passed to CSV::Row
  # @param row_number logical row number
  # @param source_row_number number of the source row this row was built from
  def initialize(headers, fields, header_row = false, row_number = nil, source_row_number = nil)
    super headers, fields, header_row
    @row_number = row_number
    @source_row_number = source_row_number
  end

  # Multi-line, human-readable dump of the row numbers and the row contents.
  def to_s
    s = '<HXLRow'
    s += "\n rowNumber: " + @row_number.to_s
    # The ivar is @source_row_number; the camelCase name was never assigned,
    # so this line previously always printed an empty value.
    s += "\n sourceRowNumber: " + @source_row_number.to_s
    s += "\n"
    # Double quotes so entries are separated by a real newline rather than a
    # literal backslash-n.
    # NOTE(review): @row is set by CSV::Row, not by this class - presumably
    # its internal list of [header, field] pairs; confirm against the csv library.
    s += @row.map(&:to_s).join("\n ")
    s += "\n>"

    s
  end

end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# Table metadata for parsing a HXL dataset
#
# Holds the column specs in source-column order and answers questions about
# the disaggregated ("fixed") columns among them.
class HXLReader::HXLTableSpec

  attr_reader :col_specs

  def initialize
    @col_specs = []
  end

  # Appends a column spec; specs are expected in source-column order.
  def push(col_spec)
    @col_specs << col_spec
  end

  # Hashtags of the logical columns, in order. The first spec that has a
  # fixed column contributes its fixed tag followed by its own tag; later
  # specs with a fixed column contribute nothing. Specs without a tag are
  # left out.
  def hxl_headers
    tags = []
    fixed_emitted = false

    @col_specs.each do |spec|
      if spec.fixed_column
        next if fixed_emitted

        fixed_emitted = true
        tags << spec.fixed_column.hxl_tag
      end

      tag = spec.column.hxl_tag
      tags << tag unless tag.nil?
    end

    tags
  end

  # Number of specs that have a fixed column.
  def get_disaggregation_count
    @col_specs.count { |col_spec| col_spec.fixed_column }
  end

  # Index into col_specs of the fixed-column spec at the given zero-based
  # position among the fixed-column specs, or -1 when there is no such spec.
  def get_raw_position(disaggregation_position)
    remaining = disaggregation_position

    found = @col_specs.index do |col_spec|
      remaining -= 1 if col_spec.fixed_column
      remaining < 0
    end

    found || -1
  end

end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# A single HXL value at the intersection of a row and column
class HXLReader::HXLValue
  # Column object this value belongs to.
  attr_reader :column

  # The cell content.
  attr_reader :value

  # Column number within the logical row.
  attr_reader :col_num

  # Column number within the source row.
  attr_reader :source_col_num

  # @param column column object this value belongs to
  # @param value the cell content
  # @param col_num column number within the logical row
  # @param source_col_num column number within the source row
  def initialize(column, value, col_num, source_col_num)
    @column         = column
    @value          = value
    @col_num        = col_num
    @source_col_num = source_col_num
  end

end
|
metadata
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: libhxl-ruby
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ben Rudolph
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-10-19 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '3.1'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '3.1'
|
27
|
+
description: A simple gem to parse your HXL files
|
28
|
+
email: rudolphben@gmail.com
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- lib/libhxl-ruby.rb
|
34
|
+
- lib/libhxl-ruby/hxl_col_spec.rb
|
35
|
+
- lib/libhxl-ruby/hxl_column.rb
|
36
|
+
- lib/libhxl-ruby/hxl_format_error.rb
|
37
|
+
- lib/libhxl-ruby/hxl_row.rb
|
38
|
+
- lib/libhxl-ruby/hxl_table_spec.rb
|
39
|
+
- lib/libhxl-ruby/hxl_value.rb
|
40
|
+
homepage: https://github.com/benrudolph/libhxl-ruby
|
41
|
+
licenses:
|
42
|
+
- MIT
|
43
|
+
metadata: {}
|
44
|
+
post_install_message:
|
45
|
+
rdoc_options: []
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - '>='
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
requirements: []
|
59
|
+
rubyforge_project:
|
60
|
+
rubygems_version: 2.4.2
|
61
|
+
signing_key:
|
62
|
+
specification_version: 4
|
63
|
+
summary: A HXL parser for ruby
|
64
|
+
test_files: []
|
65
|
+
has_rdoc:
|