hxl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7617ce05c08e1a823dc5791293080f09bcdcea85
4
+ data.tar.gz: e05db63c8a5c2a8892da21a4008251fa71f2e567
5
+ SHA512:
6
+ metadata.gz: 9aea704f066a0e4b3de031f33fbac55ee6d52005428e65ada0f487faf9df2e9ea57b18849525bfca35353188987b2579cc98b1f3c1e04c410cc7e861af157a50
7
+ data.tar.gz: ded52c3759063a970a5bab0b84c155ff2cafa4175e9b8b8dbdb89b1cbcbb8521cc7174ccc72d5a2455ccccd5c0e65794d9a65abb12aedbcfb7bd4c8712c363fe
data/lib/hxl.rb ADDED
@@ -0,0 +1,186 @@
1
# CSV.foreach is called directly in this file, so load it here instead of
# relying on hxl/hxl_row to have required it first.
require 'csv'

# Reader for CSV files annotated with a row of HXL hashtags.
#
# The first CSV row whose non-empty cells all parse as hashtag specs is taken
# as the hashtag row. Every later CSV row is turned into one or more HXLRow
# objects: a source row is repeated once per "disaggregated" column (a column
# whose spec is written as "#fixed_tag + #value_tag").
class HXL

  # A single hashtag with an optional two-letter language suffix ("#tag/xx").
  TAG_PATTERN = /(#\w+)(?:\/([[:alpha:]]{2}))?/.freeze

  # A whole cell: one tag, optionally followed by '+' and a second tag.
  # Anchored with \A/\z so that a multi-line cell which merely contains a
  # hashtag on one of its lines is not mistaken for a hashtag cell.
  FULL_TAG_PATTERN = /\A\s*#{TAG_PATTERN}(?:\s*\+\s*#{TAG_PATTERN})?\s*\z/.freeze

  # Reads the whole file and returns every logical row.
  #
  # @param path [String] path of the CSV file
  # @return [Array<HXLRow>]
  # @raise [HXLFormatError] if the file contains no hashtag row
  def self.read(path)
    foreach(path).to_a
  end

  # Yields each logical HXL row of the file at +path+.
  #
  # Rows before the hashtag row are not yielded; the row immediately before
  # it supplies the fixed values of disaggregated columns.
  #
  # @param path [String] path of the CSV file
  # @yieldparam hxl_row [HXLRow]
  # @return [Enumerator, nil] an Enumerator when called without a block
  # @raise [HXLFormatError] if the file contains no hashtag row
  def self.foreach(path, &block)
    return enum_for(:foreach, path) unless block_given?

    table_spec = nil
    prev_row = nil
    source_row_number = -1
    row_number = -1

    CSV.foreach(path) do |row|
      source_row_number += 1

      # Until a hashtag row has been found, every row is only a candidate.
      if table_spec.nil?
        table_spec = parse_hashtag_row(row, prev_row)
        prev_row = row unless table_spec
        next
      end

      # A source row expands into one logical row per disaggregated column
      # (and exactly one logical row when there are none).
      disaggregation_position = 0
      loop do
        row_number += 1
        hxl_row, disaggregation_position =
          parse_row(row, table_spec, disaggregation_position, row_number, source_row_number)
        yield hxl_row

        break unless disaggregation_position < table_spec.get_disaggregation_count
      end
    end

    raise HXLFormatError, 'HXL hashtag row not found' if table_spec.nil?
  end

  # Builds one logical row from a raw CSV row.
  #
  # Iteration is driven by the table spec rather than by the raw row, so
  # cells beyond the hashtag row's width are ignored and missing cells read
  # as nil. This also guarantees that +disaggregation_position+ advances
  # whenever the table has a disaggregated column, which the loop in
  # +foreach+ relies on to terminate.
  #
  # @param row [Array] raw CSV cells
  # @param table_spec [HXLTableSpec]
  # @param disaggregation_position [Integer] which disaggregated column to emit
  # @param row_number [Integer] logical row number
  # @param source_row_number [Integer] row number in the CSV file
  # @return [Array(HXLRow, Integer)] the row and the next disaggregation position
  def self.parse_row(row, table_spec, disaggregation_position, row_number, source_row_number)
    hxl_fields = []
    seen_fixed = false

    table_spec.col_specs.each_with_index do |col_spec, source_col_number|
      # Only parse HXL columns
      next if col_spec.column.hxl_tag.nil?

      unless col_spec.fixed_column
        # Regular column
        hxl_fields.push row[source_col_number]
        next
      end

      # All disaggregated columns collapse into a single (fixed value, value)
      # pair, emitted at the position of the first one.
      next if seen_fixed

      raw_position = table_spec.get_raw_position(disaggregation_position)
      hxl_fields.push table_spec.col_specs[raw_position].fixed_value
      hxl_fields.push row[raw_position]

      seen_fixed = true
      disaggregation_position += 1
    end

    hxl_row = HXLRow.new(table_spec.hxl_headers, hxl_fields, false, row_number, source_row_number)
    [hxl_row, disaggregation_position]
  end

  # Tries to parse a raw CSV row as the HXL hashtag row.
  #
  # @param row [Array] raw CSV cells
  # @param prev_row [Array, nil] the preceding raw row, if any; it supplies
  #   the fixed values of disaggregated columns
  # @return [HXLTableSpec, nil] nil when a non-empty cell is not a hashtag
  #   spec or when every cell is empty
  def self.parse_hashtag_row(row, prev_row)
    seen_header = false
    table_spec = HXLTableSpec.new

    row.each_with_index do |value, col_num|
      value = value.strip if value

      if value.nil? || value.empty?
        # Untagged column: keep a placeholder so indexes stay aligned.
        table_spec.push HXLColSpec.new(col_num, HXLColumn.new)
        next
      end

      col_spec = parse_hashtag(col_num, value)
      return nil if col_spec.nil?

      seen_header = true
      # prev_row is nil when the hashtag row is the first row of the file;
      # the fixed value is then left unset.
      col_spec.fixed_value = prev_row[col_num] if col_spec.fixed_column && prev_row
      table_spec.push col_spec
    end

    seen_header ? table_spec : nil
  end

  # Parses one cell of the hashtag row.
  #
  # For a two-tag spec ("#a + #b") the second tag becomes the column and the
  # first tag becomes the fixed column.
  #
  # @param source_col_number [Integer] index of the cell in the raw row
  # @param value [String] cell text
  # @return [HXLColSpec, nil] nil when the text is not a hashtag spec
  def self.parse_hashtag(source_col_number, value)
    result = FULL_TAG_PATTERN.match(value)
    return nil unless result

    first = HXLColumn.new(result[1], result[2])
    return HXLColSpec.new(source_col_number, first) unless result[3]

    second = HXLColumn.new(result[3], result[4])
    HXLColSpec.new(source_col_number, second, first)
  end

  # Placeholder kept for interface compatibility: it ignores its arguments
  # and returns whatever parse_source_row returns (currently nil).
  # TODO: implement or remove; foreach uses parse_hashtag_row instead.
  def self.parse_table_spec(row, prev_row)
    parse_source_row
  end

  # Placeholder with an empty body; always returns nil.
  def self.parse_source_row
  end

end
181
+
182
+ require 'hxl/hxl_format_error'
183
+ require 'hxl/hxl_table_spec'
184
+ require 'hxl/hxl_row'
185
+ require 'hxl/hxl_column'
186
+ require 'hxl/hxl_col_spec'
@@ -0,0 +1,19 @@
1
# Describes how one column of the input CSV is encoded, which may differ from
# the logical structure of the HXL data. Only needed while parsing.
#
# source_col_number - index of the column in the raw CSV row
# column            - the column this cell's value belongs to
# fixed_column      - set only for two-tag ("#a + #b") specs
# fixed_value       - value paired with fixed_column; writable because the
#                     parser fills it in after construction
class HXL::HXLColSpec

  attr_accessor :fixed_value
  attr_reader :source_col_number, :column, :fixed_column

  def initialize(source_col_number, column = nil, fixed_column = nil, fixed_value = nil)
    @source_col_number, @column = source_col_number, column
    @fixed_column, @fixed_value = fixed_column, fixed_value
  end

end
@@ -0,0 +1,20 @@
1
# A logical HXL column: its hashtag, an optional language code, and a
# human-readable header derived from the hashtag at construction time.
class HXL::HXLColumn

  attr_reader :hxl_tag, :language_code, :header_text

  def initialize(hxl_tag = nil, language_code = nil)
    @hxl_tag = hxl_tag
    @language_code = language_code
    @header_text = pretty_tag
  end

  # Turns the hashtag into header text: drops the leading '#', drops a
  # trailing _date/_deg/_id/_link/_num type suffix, replaces the remaining
  # underscores with spaces and upcases the result.
  # Returns '' when the column has no hashtag.
  def pretty_tag
    if @hxl_tag
      without_hash = @hxl_tag.gsub(/^#/, '')
      without_type = without_hash.gsub(/_(date|deg|id|link|num)$/, '')
      without_type.gsub('_', ' ').upcase
    else
      ''
    end
  end
end
@@ -0,0 +1,2 @@
1
# Error type for input that is not valid HXL; HXL.foreach raises it when no
# hashtag row is found in the file.
class HXL::HXLFormatError < StandardError; end
@@ -0,0 +1,27 @@
1
+ require 'csv'
2
+
3
# A CSV::Row whose headers are HXL hashtags, annotated with its position in
# the logical HXL data (row_number) and in the source CSV (source_row_number).
class HXL::HXLRow < CSV::Row

  attr_reader :row_number, :source_row_number

  # headers, fields and header_row are passed straight to CSV::Row; the two
  # row numbers are only stored.
  def initialize(headers, fields, header_row = false, row_number = nil, source_row_number = nil)
    super(headers, fields, header_row)
    @row_number = row_number
    @source_row_number = source_row_number
  end

  # Multi-line debug representation: the two row numbers followed by one
  # [header, value] pair per line.
  def to_s
    lines = [
      '<HXLRow',
      " rowNumber: #{@row_number}",
      " sourceRowNumber: #{@source_row_number}"
    ]
    # CSV::Row enumerates [header, field] pairs; each gets its own line.
    lines.concat(map { |pair| " #{pair}" })
    lines << '>'
    lines.join("\n")
  end

end
@@ -0,0 +1,45 @@
1
# Table-level parsing metadata for a HXL dataset: the ordered list of column
# specs taken from the hashtag row, plus helpers for disaggregated columns
# (specs that carry a fixed_column).
class HXL::HXLTableSpec

  attr_reader :col_specs

  def initialize
    @col_specs = []
  end

  # Appends a column spec; returns the list of specs.
  def push(col_spec)
    @col_specs << col_spec
  end

  # Hashtags of the logical columns, in order. Specs without a hashtag are
  # skipped. Only the first spec with a fixed column contributes, and it
  # contributes two entries: the fixed column's tag, then its own tag.
  def hxl_headers
    fixed_emitted = false

    @col_specs.each_with_object([]) do |spec, tags|
      if spec.fixed_column
        next if fixed_emitted

        tags << spec.fixed_column.hxl_tag
        fixed_emitted = true
      end

      own_tag = spec.column.hxl_tag
      tags << own_tag unless own_tag.nil?
    end
  end

  # Number of specs that carry a fixed column.
  def get_disaggregation_count
    @col_specs.count { |spec| spec.fixed_column }
  end

  # Index (within col_specs) of the disaggregated spec at the given zero-based
  # position among disaggregated specs, or -1 when there is no such spec.
  def get_raw_position(disaggregation_position)
    remaining = disaggregation_position

    @col_specs.each_with_index do |spec, index|
      remaining -= 1 if spec.fixed_column
      return index if remaining < 0
    end

    -1
  end

end
metadata ADDED
@@ -0,0 +1,64 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hxl
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Ben Rudolph
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-10-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '3.1'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '3.1'
27
+ description: A simple gem to parse your HXL files
28
+ email: rudolphben@gmail.com
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - lib/hxl.rb
34
+ - lib/hxl/hxl_col_spec.rb
35
+ - lib/hxl/hxl_column.rb
36
+ - lib/hxl/hxl_format_error.rb
37
+ - lib/hxl/hxl_row.rb
38
+ - lib/hxl/hxl_table_spec.rb
39
+ homepage: https://github.com/benrudolph/libhxl-ruby
40
+ licenses:
41
+ - MIT
42
+ metadata: {}
43
+ post_install_message:
44
+ rdoc_options: []
45
+ require_paths:
46
+ - lib
47
+ required_ruby_version: !ruby/object:Gem::Requirement
48
+ requirements:
49
+ - - '>='
50
+ - !ruby/object:Gem::Version
51
+ version: '0'
52
+ required_rubygems_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - '>='
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ requirements: []
58
+ rubyforge_project:
59
+ rubygems_version: 2.4.2
60
+ signing_key:
61
+ specification_version: 4
62
+ summary: A HXL parser for ruby
63
+ test_files: []
64
+ has_rdoc: