hxl 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/hxl.rb +186 -0
- data/lib/hxl/hxl_col_spec.rb +19 -0
- data/lib/hxl/hxl_column.rb +20 -0
- data/lib/hxl/hxl_format_error.rb +2 -0
- data/lib/hxl/hxl_row.rb +27 -0
- data/lib/hxl/hxl_table_spec.rb +45 -0
- metadata +64 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: 7617ce05c08e1a823dc5791293080f09bcdcea85
|
|
4
|
+
data.tar.gz: e05db63c8a5c2a8892da21a4008251fa71f2e567
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 9aea704f066a0e4b3de031f33fbac55ee6d52005428e65ada0f487faf9df2e9ea57b18849525bfca35353188987b2579cc98b1f3c1e04c410cc7e861af157a50
|
|
7
|
+
data.tar.gz: ded52c3759063a970a5bab0b84c155ff2cafa4175e9b8b8dbdb89b1cbcbb8521cc7174ccc72d5a2455ccccd5c0e65794d9a65abb12aedbcfb7bd4c8712c363fe
|
data/lib/hxl.rb
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# Reader for CSV files that carry a row of HXL hashtags.
#
# The reader scans the CSV for the first row whose non-empty cells are all
# HXL hashtags and then turns every following source row into one or more
# HXLRow objects. A cell holding two hashtags joined by '+'
# ("#sex + #targeted_num") marks a disaggregated column: every source row is
# expanded into one logical row per such column.
class HXL

  # A single hashtag with an optional two-letter language suffix ("#org/fr").
  TAG_PATTERN = /(#\w+)(?:\/([[:alpha:]]{2}))?/

  # A complete hashtag cell: one tag, optionally followed by '+' and a second
  # tag. Anchored with \A/\z so that only a cell consisting entirely of
  # hashtags matches (a multi-line cell with a tag on one line does not).
  TAG_SPEC_PATTERN = /\A\s*#{TAG_PATTERN}(?:\s*\+\s*#{TAG_PATTERN})?\s*\z/

  # Read a whole HXL CSV file into memory.
  #
  # @param path [String] path of the CSV file
  # @return [Array<HXLRow>] every logical row, in file order
  # @raise [HXLFormatError] if the file has no hashtag row
  def self.read(path)
    rows = []
    foreach(path) { |row| rows.push(row) }
    rows
  end

  # Stream the logical rows of a HXL CSV file.
  #
  # Rows before the hashtag row are skipped (the one directly above it
  # supplies the fixed values of disaggregated columns). Each later source
  # row is yielded once, or once per disaggregated column when the table
  # has any.
  #
  # @param path [String] path of the CSV file
  # @yieldparam row [HXLRow]
  # @return [Enumerator, nil] an Enumerator when called without a block
  # @raise [HXLFormatError] if the file has no hashtag row
  def self.foreach(path, &block)
    return to_enum(:foreach, path) unless block_given?

    table_spec = nil
    prev_row = nil
    source_row_number = -1
    row_number = -1

    CSV.foreach(path) do |row|
      source_row_number += 1

      # Until the hashtag row is found, every row is only a candidate for it.
      if table_spec.nil?
        table_spec = parse_hashtag_row(row, prev_row)
        prev_row = row
        next
      end

      disaggregation_count = table_spec.get_disaggregation_count
      disaggregation_position = 0

      loop do
        # Next logical row
        row_number += 1

        hxl_row, next_position = parse_row(row,
                                           table_spec,
                                           disaggregation_position,
                                           row_number,
                                           source_row_number)
        yield hxl_row

        # Stop once every disaggregated column has been emitted. Also stop
        # when parse_row made no progress (the source row is too short to
        # reach a disaggregated column); otherwise this loop never ends.
        advanced = next_position > disaggregation_position
        break if !advanced || next_position >= disaggregation_count

        disaggregation_position = next_position
      end
    end

    raise HXLFormatError, 'HXL hashtag row not found' if table_spec.nil?
  end

  # Build one logical HXLRow from a raw CSV row.
  #
  # Untagged columns are dropped. Ordinary tagged columns contribute their
  # cell value. The first disaggregated column met contributes two fields:
  # the fixed value and the cell of the disaggregated column selected by
  # +disaggregation_position+; further disaggregated columns are skipped.
  #
  # @param row [Array] raw CSV cells
  # @param table_spec [HXLTableSpec] column layout from the hashtag row
  # @param disaggregation_position [Integer] which disaggregated column to emit
  # @param row_number [Integer] logical row number stored on the result
  # @param source_row_number [Integer] CSV row number stored on the result
  # @return [Array(HXLRow, Integer)] the row and the disaggregation position
  #   to use for the next logical row
  def self.parse_row(row, table_spec, disaggregation_position, row_number, source_row_number)
    hxl_fields = []
    seen_fixed = false

    row.each_with_index do |value, source_col_number|
      col_spec = table_spec.col_specs[source_col_number]

      # Only parse HXL columns; cells beyond the width of the hashtag row
      # have no col_spec at all.
      next if col_spec.nil? || col_spec.column.hxl_tag.nil?

      unless col_spec.fixed_column
        # Regular column
        hxl_fields.push(value)
        next
      end

      # Disaggregated column: only the first one met emits fields.
      next if seen_fixed

      raw_position = table_spec.get_raw_position(disaggregation_position)
      hxl_fields.push(table_spec.col_specs[raw_position].fixed_value)
      hxl_fields.push(row[raw_position])

      seen_fixed = true
      disaggregation_position += 1
    end

    hxl_row = HXLRow.new(table_spec.hxl_headers,
                         hxl_fields,
                         false,
                         row_number,
                         source_row_number)

    [hxl_row, disaggregation_position]
  end

  # Try to parse a raw CSV row as the HXL hashtag row.
  #
  # Blank cells become untagged columns. Any non-blank cell that is not a
  # valid hashtag disqualifies the whole row.
  #
  # @param row [Array] raw CSV cells of the candidate row
  # @param prev_row [Array, nil] the row above it, used as the source of
  #   fixed values for disaggregated columns; nil when there is none
  # @return [HXLTableSpec, nil] the table spec, or nil when the row is not a
  #   hashtag row (including a row with no hashtags at all)
  def self.parse_hashtag_row(row, prev_row)
    seen_header = false
    table_spec = HXLTableSpec.new

    row.each_with_index do |value, col_num|
      value = value.strip if value

      if value.nil? || value.empty?
        table_spec.push(HXLColSpec.new(col_num, HXLColumn.new))
        next
      end

      col_spec = parse_hashtag(col_num, value)
      return nil if col_spec.nil?

      seen_header = true

      # A hashtag row on the very first line has no row above it, so the
      # fixed value simply stays nil in that case.
      col_spec.fixed_value = prev_row[col_num] if col_spec.fixed_column && prev_row

      table_spec.push(col_spec)
    end

    seen_header ? table_spec : nil
  end

  # Parse the text of one hashtag cell.
  #
  # @param source_col_number [Integer] index of the cell in the CSV row
  # @param value [String] cell text
  # @return [HXLColSpec, nil] the column spec, or nil when the text is not a
  #   hashtag. For "#a + #b" the spec's column is #b and its fixed_column
  #   is #a.
  def self.parse_hashtag(source_col_number, value)
    result = TAG_SPEC_PATTERN.match(value)
    return nil unless result

    first_column = HXLColumn.new(result[1], result[2])
    return HXLColSpec.new(source_col_number, first_column) unless result[3]

    # There were two tags
    second_column = HXLColumn.new(result[3], result[4])
    HXLColSpec.new(source_col_number, second_column, first_column)
  end

  # Unfinished placeholder: ignores its arguments and returns whatever
  # parse_source_row returns (currently nil).
  def self.parse_table_spec(row, prev_row)
    parse_source_row
  end

  # Unfinished placeholder with an empty body; returns nil.
  def self.parse_source_row
  end

end
|
|
181
|
+
|
|
182
|
+
require 'hxl/hxl_format_error'
|
|
183
|
+
require 'hxl/hxl_table_spec'
|
|
184
|
+
require 'hxl/hxl_row'
|
|
185
|
+
require 'hxl/hxl_column'
|
|
186
|
+
require 'hxl/hxl_col_spec'
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Column metadata for parsing a HXL CSV file.
#
# Describes how one column is encoded in the input CSV file, which might be
# different from the logical structure of the HXL data. Used only during
# parsing.
class HXL::HXLColSpec

  # fixed_value is writable because HXL.parse_hashtag_row assigns it after
  # the spec has been built; the other attributes are set once at creation.
  attr_accessor :fixed_value
  attr_reader :source_col_number, :column, :fixed_column

  # @param source_col_number index of the column in the source CSV row
  # @param column the column's own tag descriptor
  # @param fixed_column descriptor of the first tag of a "+"-joined pair,
  #   or nil for an ordinary column
  # @param fixed_value value paired with fixed_column, or nil
  def initialize(source_col_number, column = nil, fixed_column = nil, fixed_value = nil)
    @source_col_number, @column = source_col_number, column
    @fixed_column, @fixed_value = fixed_column, fixed_value
  end

end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# One HXL hashtag: the tag itself, an optional language code, and a
# human-readable header derived from the tag.
class HXL::HXLColumn

  attr_reader :hxl_tag, :language_code, :header_text

  # Both arguments default to nil; header_text is computed from the tag at
  # construction time.
  def initialize(hxl_tag = nil, language_code = nil)
    @hxl_tag = hxl_tag
    @language_code = language_code
    @header_text = pretty_tag
  end

  # Human-readable form of the tag: the leading '#' and a trailing
  # _date/_deg/_id/_link/_num suffix are removed, underscores become spaces
  # and the result is upper-cased. Returns '' when there is no tag.
  def pretty_tag
    return '' unless @hxl_tag

    without_hash = @hxl_tag.gsub(/^#/, '')
    without_suffix = without_hash.gsub(/_(date|deg|id|link|num)$/, '')
    without_suffix.tr('_', ' ').upcase
  end
end
|
data/lib/hxl/hxl_row.rb
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
require 'csv'
|
|
2
|
+
|
|
3
|
+
# A CSV::Row of HXL values that also remembers where it came from: its
# logical row number and the number of the CSV source row it was built from.
class HXL::HXLRow < CSV::Row

  attr_reader :row_number, :source_row_number

  # @param headers [Array] header values, passed through to CSV::Row
  # @param fields [Array] field values, passed through to CSV::Row
  # @param header_row [Boolean] passed through to CSV::Row
  # @param row_number logical row number, or nil
  # @param source_row_number row number in the source CSV, or nil
  def initialize(headers, fields, header_row = false, row_number = nil, source_row_number = nil)
    super(headers, fields, header_row)
    @row_number = row_number
    @source_row_number = source_row_number
  end

  # Multi-line debug representation: both row numbers followed by the row's
  # contents, one entry per line.
  def to_s
    # @row is inherited from CSV::Row (presumably its header/field pairs --
    # confirm against the csv library version in use).
    entries = @row.map(&:to_s).join("\n ")

    [
      '<HXLRow',
      " rowNumber: #{@row_number}",
      " sourceRowNumber: #{@source_row_number}",
      entries,
      '>'
    ].join("\n")
  end

end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Table metadata for parsing a HXL dataset: the ordered list of column specs
# taken from the hashtag row, plus helpers for disaggregated columns (specs
# whose fixed_column is set).
class HXL::HXLTableSpec

  attr_reader :col_specs

  def initialize
    @col_specs = []
  end

  # Append a column spec; returns the internal list of specs.
  def push(col_spec)
    @col_specs << col_spec
  end

  # Hashtags of the logical columns, in order. Columns without a tag are
  # left out. Only the first disaggregated column contributes, and it
  # contributes its fixed column's tag followed by its own tag.
  def hxl_headers
    fixed_emitted = false

    @col_specs.each_with_object([]) do |spec, tags|
      if spec.fixed_column
        next if fixed_emitted

        tags << spec.fixed_column.hxl_tag
        fixed_emitted = true
      end

      tag = spec.column.hxl_tag
      tags << tag unless tag.nil?
    end
  end

  # Number of disaggregated columns.
  def get_disaggregation_count
    @col_specs.count { |col_spec| col_spec.fixed_column }
  end

  # Source column index of the disaggregated column at the given zero-based
  # position among the disaggregated columns, or -1 when there is none.
  def get_raw_position(disaggregation_position)
    remaining = disaggregation_position

    found = @col_specs.index do |col_spec|
      remaining -= 1 if col_spec.fixed_column
      remaining < 0
    end

    found || -1
  end

end
|
metadata
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: hxl
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.0.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Ben Rudolph
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2014-10-19 00:00:00.000000000 Z
|
|
12
|
+
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: rspec
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - ~>
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '3.1'
|
|
20
|
+
type: :development
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - ~>
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '3.1'
|
|
27
|
+
description: A simple gem to parse your HXL files
|
|
28
|
+
email: rudolphben@gmail.com
|
|
29
|
+
executables: []
|
|
30
|
+
extensions: []
|
|
31
|
+
extra_rdoc_files: []
|
|
32
|
+
files:
|
|
33
|
+
- lib/hxl.rb
|
|
34
|
+
- lib/hxl/hxl_col_spec.rb
|
|
35
|
+
- lib/hxl/hxl_column.rb
|
|
36
|
+
- lib/hxl/hxl_format_error.rb
|
|
37
|
+
- lib/hxl/hxl_row.rb
|
|
38
|
+
- lib/hxl/hxl_table_spec.rb
|
|
39
|
+
homepage: https://github.com/benrudolph/libhxl-ruby
|
|
40
|
+
licenses:
|
|
41
|
+
- MIT
|
|
42
|
+
metadata: {}
|
|
43
|
+
post_install_message:
|
|
44
|
+
rdoc_options: []
|
|
45
|
+
require_paths:
|
|
46
|
+
- lib
|
|
47
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
48
|
+
requirements:
|
|
49
|
+
- - '>='
|
|
50
|
+
- !ruby/object:Gem::Version
|
|
51
|
+
version: '0'
|
|
52
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
53
|
+
requirements:
|
|
54
|
+
- - '>='
|
|
55
|
+
- !ruby/object:Gem::Version
|
|
56
|
+
version: '0'
|
|
57
|
+
requirements: []
|
|
58
|
+
rubyforge_project:
|
|
59
|
+
rubygems_version: 2.4.2
|
|
60
|
+
signing_key:
|
|
61
|
+
specification_version: 4
|
|
62
|
+
summary: A HXL parser for ruby
|
|
63
|
+
test_files: []
|
|
64
|
+
has_rdoc:
|