nodepile 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec +2 -0
- data/.rubocop.yml +1 -1
- data/BACKLOG.md +34 -0
- data/Rakefile +92 -2
- data/lib/nodepile/base_structs.rb +62 -0
- data/lib/nodepile/colspecs.rb +562 -0
- data/lib/nodepile/gross_actions.rb +38 -0
- data/lib/nodepile/gviz.rb +108 -0
- data/lib/nodepile/keyed_array.rb +386 -0
- data/lib/nodepile/pile_organizer.rb +258 -0
- data/lib/nodepile/pragmas.rb +97 -0
- data/lib/nodepile/rec_source.rb +329 -0
- data/lib/nodepile/rule_eval.rb +155 -0
- data/lib/nodepile/version.rb +1 -1
- data/nodepile.gemspec +53 -0
- data/tmp/.gitignore +1 -0
- metadata +136 -19
data/lib/nodepile/pile_organizer.rb
@@ -0,0 +1,258 @@
+require 'nodepile/colspecs.rb'
+require 'nodepile/rec_source.rb'
+require 'nodepile/pragmas.rb'
+require 'nodepile/rule_eval.rb'
+
+module Nodepile
+
+  # Container class for managing a Nodepile. A nodepile consists of a set of
+  # entities including nodes, edges, and rules. It includes methods for
+  # enumerating the various items in the collection, filtering, and
+  # deducing the existence of implied edges using rules.
+  class PileOrganizer
+
+    # see nodepile/base_structs.rb for definition of a Nodepile::EntityPacket
+    class ERecStack; end #ERecStack defined further down
+    class RuleCache; end # defined further down
+    SourceData = Struct.new(:source_name,:highest_sequence_num)
+
+
+    def initialize()
+      @nodes = Hash.new{|h,k| h[k] = ERecStack.new}
+      @edges = Hash.new{|h,k| h[k] = ERecStack.new}
+      @rules = Array.new # not subject to overlaying with themselves
+      @pragmas = Nodepile::Pragmas.new
+      @sources = Hash.new{|h,k| h[k] = SourceData.new(k,0)} # insert a dummy source for unspecified
+      @last_source_name = nil
+      @dirty = true
+    end
+
+    # If a source name is not specified, then the source is assumed to be
+    # the last source that was used to append. If no sequence number is provided
+    # then the sequence_number is assumed to be one more than the highest
+    # sequence number that was specified. If callers are manually specifying
+    # sequence numbers for a source, they should do so consistently to avoid
+    # repeats.
+    # @param kaa [KeyedArrayAccessor] Includes metadata about @type, @key, @is_implied
+    # @return [self]
+    def append(kaa)
+      @last_source_name = kaa.source || @last_source_name
+      source_data = @sources[@last_source_name]
+      # note that the way things work below deliberately "overlays" items
+      # when a matching key is encountered. Rule recalculation is deferred
+      case kaa['@type']
+      when :node
+        @nodes[kaa['@key']] << kaa
+      when :edge
+        @edges[kaa['@key']] << kaa
+      when :rule
+        @rules << RuleCache.new(kaa)
+      when :pragma
+        @pragmas.parse(kaa['_id'])
+      else
+        raise "Unhandled entity type #{kaa['@type'].inspect}"
+      end #case
+      return self
+    end
+
+    def node_count() = @nodes.length
+    def rule_count() = @rules.length
+    def edge_count() = @edges.length
+    def pragmas() = @pragmas
+
+    def entity_record(key)
+      _update_rule_impacts
+      case key
+      when String
+        return @nodes[key]
+      when Array
+        return @edges[key]
+      else
+        raise "Unrecognized key structure/type"
+      end
+    end
+
+    # Provide summarized records in order
+    def edge_records
+      return enum_for(:edge_records) unless block_given?
+      _update_rule_impacts
+      @edges.each_value{|erstack| yield(erstack.summary) }
+    end
+
+    # Provide the summarized records
+    def node_records
+      return enum_for(:node_records) unless block_given?
+      _update_rule_impacts
+      @nodes.each_value{|erstack| yield(erstack.summary) }
+    end
+
+
+    # Alias for #append()
+    # @param entity_record [Nodepile::KeyedArrayAccessor]
+    def <<(entity_record) = append(entity_record)
+
+    # Loads the given file (on top of anything already stored in this object)
+    def load_from_file(tabular_filepath)
+      source = Nodepile::TabularRecordSource.new(tabular_filepath,format: :guess)
+      specs = nil
+      loaded_entity_count = 0
+      rec_src_meta = {'path' => tabular_filepath,'rec_num' => nil}
+      metadata = Hash.new
+      source.each{|(rec,rec_num)|
+        rec_src_meta['rec_num'] = rec_num
+        if specs.nil? #first row is header
+          specs = Nodepile::InputColumnSpecs.new(rec,metadata_key_prefix: '@')
+        else
+          begin
+            specs.parse(rec,source: tabular_filepath,
+                        ref_num: rec_num,
+                        metadata: metadata,
+                       ){|keyed_array_accessor|
+              append(keyed_array_accessor)
+              loaded_entity_count += 1
+            }
+          rescue Nodepile::InputColumnSpecs::InvalidRecordError => err
+            # re-raise but add info about the record number that triggered the error
+            err.rec_num = rec_num
+            err.file_path = tabular_filepath
+            raise # re-raise
+          end
+        end #if
+      }
+      return loaded_entity_count
+    end
+
+
+    private
+    # This sledgehammer approach deletes all calculated impacts that may
+    # have previously been applied to the @edges and @nodes and recalculates
+    # all of them.
+    #
+    # Method is a no-op if the structures are up-to-date
+    def _update_rule_impacts(force = false)
+      return nil unless force || @dirty
+      @nodes.each_value(&:purge_rule_overlays)
+      @edges.each_value(&:purge_rule_overlays)
+      @rules.each{|rulecache|
+        recs = (rulecache.relevant_entity_type == :node ? @nodes : @edges).each_value
+        recs.each{|erstack|
+          if rulecache.match?(erstack.summary)
+            # calculate the rule as applied to the given node/edge
+            calculated_rule_erec = rulecache.eval_using(erstack.summary)
+            erstack << calculated_rule_erec
+          end
+        }
+      } #loop over rules
+      @dirty = false
+    end
+
+    # An ERecStack is a data structure used for holding and summarizing
+    # overlay-able records related to a given Node or Edge which can include
+    # "rules" that apply to that node/edge
+    class ERecStack
+      def initialize()
+        @a = Array.new
+        @summary = nil
+        @mc = CrudeCalculationCache.new
+      end
+
+      def inspect
+        "#<#{self.class}:0x#{object_id} type=#{type} key=#{self.key.inspect} depth=#{@a.length}>"
+      end
+
+
+      def type = @a.first['@type']
+      def key() = @a.first['@key']
+
+      def is_node? = self.type == :node
+      def is_edge? = self.type == :edge
+      def summary() = @summary
+      def to_a = @a
+
+      # A stack of type :node or :edge is implied if it contains
+      # no ERec records where the is_implied attribute is false.
+      # The return value of this method is undefined for other types.
+      def is_implied
+        @a.each{|kaa| return false if !kaa['@is_implied'] &&
+                                      [:node,:edge].include?(kaa['@type']) }
+        return true
+      end
+
+      # Delete overlayed rule records
+      # @return [void]
+      def purge_rule_overlays()
+        @a.delete_if{|rec| rec['@type'] == :rule}
+        @summary = nil # discard the stale summary so the recalculation starts fresh
+        @a.each{|rec| @summary = self.class._update_summary(@summary,rec)} #recalc
+      end
+
+
+      # Note that this method does not verify whether it is appropriate to stack the new
+      # record and assumes callers have already done this due-diligence.
+      # @param rec [KeyedArrayAccessor]
+      def <<(rec)
+        raise "ERecStack may only hold objects of type Nodepile::KeyedArrayAccessor" unless rec.is_a?(Nodepile::KeyedArrayAccessor)
+        # Keep the summary up-to-date if we've got one.
+        @a << rec
+        if @a.length == 1
+          @summary = rec.dup
+        else
+          @summary = self.class._update_summary(@summary,rec)
+        end
+        return self
+      end
+
+      def each_keyed_array()
+        return enum_for(:each_keyed_array) unless block_given?
+        @a.each{|erec| yield erec }
+      end
+
+      private
+      def self._update_summary(cur_summary,new_overlay)
+        return new_overlay.dup if cur_summary.nil? # dup so later underlays cannot mutate the stored record
+        cur_summary.underlay!(new_overlay)
+        cur_summary.source = nil if cur_summary.source != new_overlay.source
+        cur_summary.ref_num = nil #summary no longer represents a single ref_num
+        cur_summary.update_metadata('@is_implied',false) unless new_overlay['@is_implied']
+        return cur_summary
+      end
+
+    end #class Nodepile::PileOrganizer::ERecStack
+
+    # Represents cached information about a single specific rule
+    class RuleCache
+
+      # @param rule_erec [KeyedArrayAccessor] created from this rule record
+      def initialize(rule_erec)
+        @er = rule_erec.freeze #just to avoid casual alteration
+        raise "Only ERecs of type :rule may be stored in this structure" unless self.type == :rule
+        @verifiers = [*self.key].map{|s| Nodepile::InputColumnSpecs.make_pattern_match_verifier(s)}
+        @rule_eval = RuleRecordEvaluator.new(@er)
+      end
+
+      def inspect
+        "#<#{self.class}:0x#{object_id} key=#{self.key.inspect}>"
+      end
+
+      def type = @er['@type']
+      def is_implied = @er['@is_implied']
+      def key = @er['@key']
+
+      # Note that a rule that uses dynamic matching cannot precalculate which
+      # records it matches and must (to be safe) recalculate the match in response
+      # to any changes
+      def uses_dynamic_match? = @rule_eval.uses_dynamic_match?
+
+      # @param kaa [KeyedArrayAccessor] a set of field values that will be
+      #            tested against this rule for matching.
+      def match?(kaa) = @rule_eval.match_record?(kaa)
+      def eval_using(kaa) = @rule_eval.calculate_rule(kaa)
+
+
+      def relevant_entity_type = @er['@key'].is_a?(String) ? :node : :edge
+
+    end #class Nodepile::PileOrganizer::RuleCache
+
+  end #class PileOrganizer
+
+
+end #module Nodepile
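
A minimal usage sketch for the PileOrganizer added above (the require path follows the gem's lib/ layout; the input file name and its column layout are hypothetical, assumed to be a CSV that InputColumnSpecs can interpret):

    require 'nodepile/pile_organizer'

    pile = Nodepile::PileOrganizer.new
    pile.load_from_file('my_graph.csv')   # hypothetical input file
    puts pile.node_count                  # counts exclude rules and pragmas
    pile.node_records.each{|rec| p rec }  # summaries, rule impacts applied lazily
    pile.edge_records.each{|rec| p rec }
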
data/lib/nodepile/pragmas.rb
@@ -0,0 +1,97 @@
+
+
+module Nodepile
+
+  # Pragmas is a parser and organizer class used to interpret the meaning
+  # of pragmas that may appear in source files. By default, pragmas are
+  # coded instructions for rendering, parsing, or layout that may be embedded
+  # in input files. They are often instructions stored in the "_id" field
+  # of an input file that begin with a specific indicating string '#pragma '
+  # Pragmas may be used to control things like the specific layout engine that
+  # is used to visualize a graph (e.g. dot versus neato)
+  # Example pragma lines are:
+  #
+  #    #pragma neato
+  #    #pragma unflatten
+  #
+  # Create an instance of a Nodepile::Pragmas object in order to track and
+  # interpret the collective pragmas of a given graph. Note that the most
+  # common rule in #pragma interpretation is that if two pragmas contradict each
+  # other, the one furthest down in the file/parsing stream dominates.
+  class Pragmas
+    DEFAULT_PRAGMA_MARKER = "#pragma "
+
+    @@indicator_patterns = Array.new # array of [pragma_sym,regexp]
+
+    def initialize(pragma_marker: DEFAULT_PRAGMA_MARKER)
+      @marker = pragma_marker.freeze
+      @indicators = Hash.new #name mapped to value
+      # if you make this method more complex, remember to update #dup
+    end
+
+    def dup
+      c = self.class.new(pragma_marker: @marker)
+      c._indicators.merge!(@indicators)
+      return c
+    end
+
+    def [](pragma_sym)
+      @indicators[pragma_sym]
+    end
+
+    # @yield [pragma_sym,pragma_val] provides pairs of name values as they have
+    #        been set. Only set values will appear in the yielded set.
+    def each_setting_pair
+      return enum_for(__method__) unless block_given?
+      @indicators.each_pair{|k,v| yield(k,v) }
+      return @indicators.length
+    end
+
+    # Parse the given pragma and store the meaning for access via the square bracket
+    # method or the #each_setting_pair method.
+    # @return [void]
+    def parse(pragma_string)
+      raise "Expecting pragma_string to start with [#{@marker}]" unless pragma_string.start_with?(@marker)
+
+      #TODO: If there are more complicated parsing rules, they should go before
+      #      the simple fallthrough case of whitespace separated indicators
+
+      # Simple indicators are single "word" values where each value is delimited
+      # by whitespace. If two indicators apply to the same pragma_sym, the last one wins.
+      pragma_string[@marker.length..-1].split(/\s+/).each{|s|
+        prag_sym,_ = @@indicator_patterns.find{|(prag_sym,rx)| rx.match(s) }
+        if prag_sym
+          @indicators[prag_sym] = $1
+        else
+          raise "Unrecognized pragma encountered [#{s}]"
+        end
+      }
+      return nil
+    end
+
+    private
+
+    # Declares a simple indicator. Simple indicators are pragmas whose presence
+    # or absence is the indicator. Such indicators can be stacked (multiple per
+    # pragma expression). For example:
+    #    #pragma neato unflatten
+    #
+    # The above line would mean that two separate effects were being invoked:
+    # the use of the "neato" rendering engine and the use of the unflatten
+    # setting to improve the aspect ratio of some directed graphs
+    def self._decl_simple_indicator(prag_name,alt_regexp)
+      @@indicator_patterns << [prag_name,alt_regexp].freeze
+    end
+
+    _decl_simple_indicator(:layout_engine,/^(dot|neato|fdp|sfdp|circo|twopi|nop2|osage|patchwork)$/)
+    _decl_simple_indicator(:directionality,/^(graph|digraph)$/)
+    #_decl_simple_indicators(:unflatten,/^(unflatten)$/) # not sure this is supported
+    protected
+    # Crude accessor (protected) for duplication and merge
+    def _indicators = @indicators
+
+
+  end #class Nodepile::Pragmas
+
+
+end #module Nodepile
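
A short sketch of how the Pragmas class above behaves, based on the two indicators it declares (layout_engine and directionality):

    pragmas = Nodepile::Pragmas.new
    pragmas.parse('#pragma neato digraph')  # two simple indicators in one pragma line
    pragmas[:layout_engine]                 # => "neato"
    pragmas[:directionality]                # => "digraph"
    pragmas.parse('#pragma dot')            # a later, contradicting pragma dominates
    pragmas[:layout_engine]                 # => "dot"
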
data/lib/nodepile/rec_source.rb
@@ -0,0 +1,329 @@
+require 'stringio'
+require 'csv' # CSV is used below for record parsing
+
+module Nodepile
+
+  # Generates "Factories" for harvesting tabular data from a source stream/file.
+  # Includes facilities for parsing common file formats (CSV/TSV).
+  # Includes facilities for handling common problems encountered when parsing
+  # manually-created tabular data files such as: relevant tabular data that is not
+  # aligned "top-left", tabular data that includes blank or repeated columns,
+  # tabular data that ends before the end of the file, and
+  # summary rows appearing in the tabular data that need to be ignored.
+  class TabularRecordSource
+    include Enumerable
+
+    DEFAULT_LOADING_GUIDELINES = {
+      mandatory_headers: [], # this can be extremely important for correctly finding tables
+      format: :csv||:tsv||:guess, #assume CSV unless told otherwise
+      allow_leading_skip_rows: 10, # arbitrary content that may appear before table
+      allow_gap_rows: 2||nil, # entirely blank rows appearing mid-table, nil indicates allow infinite
+      allow_gap_columns: 1, # columns which have a blank header within the table
+      allow_left_offset: 5, # blank columns allowed left of table
+      duplicate_header_rule: :first||:last||:ignore||:rename||:fail, #keep the first
+      ignored_header_char: '#', # header names starting with this are just plain ignored
+      emit_blank_records: false, # unless true, entirely blank records are not returned
+      trim_headers: true, #strip leading and trailing spaces
+    }.freeze
+
+    # Create a new RecordSource intended to read from the specified input
+    # and using the parsing strategy specified by the loading guidelines.
+    def initialize(source,**loading_guidelines)
+      (loading_guidelines.keys - DEFAULT_LOADING_GUIDELINES.keys).tap{|x| raise <<~ERRMSG unless x.empty?}
+        Unrecognized named parameters used for RecordSource creation #{x.inspect}
+      ERRMSG
+      @loading_guidelines = DEFAULT_LOADING_GUIDELINES.merge(loading_guidelines).freeze
+      raise "The source must be non-nil" if source.nil?
+      @source = source # will lazy load
+      @is_mid_read = false # only relevant for non-parallel sources
+      @replayable_flag = if @source.is_a?(String)
+                           :parallel # simultaneous each() is okay
+                         elsif @source.respond_to?(:rewind)
+                           :single # can't guarantee simultaneous each() safe
+                         else
+                           nil
+                         end
+    end #new
+
+
+    # Yields the "records" of the first "table" encountered in the bound data
+    # source according to the parameters it was given. First row yielded is
+    # always the header. Raises an error if a header is not found.
+    # Beware... depending on the type of data source used at creation, it
+    # may not be possible to rewind or retrieve data in parallel.
+    # With that said, a filename or String both allow parallel retrieval.
+    #
+    # Also note that blank strings will be passed through until the specified
+    # allow_gap_rows are exceeded. This can mean trailing blanks in long files.
+    #
+    # @yieldparam [Array] Array includes at least two elements. The first is
+    #             an Array of "fields". The second element is the record
+    #             number within the source (zero-indexed). It's important
+    #             to note that if any field contains embedded newlines, the
+    #             record number is not the same as the line number.
+    # @return [Integer,Enumerator] Returns enumerator if no block is given.
+    #             Otherwise returns the count of records yielded excluding
+    #             the header line.
+    def each(&block)
+      return enum_for(:each) unless block_given?
+      raise "This data source type may only be read once." if @source.nil?
+      raise <<~ERRMSG if @is_mid_read && @replayable_flag != :parallel
+        For this type of data source, you may not read simultaneously.
+      ERRMSG
+      @is_mid_read = true
+      scanner = self.class._make_record_stream(@source,format: @loading_guidelines[:format])
+      scanner = self.class._reposition_to_header_rec(scanner,@loading_guidelines)
+      raw_header,header_pos = scanner.next
+      header_range = self.class._calc_header_range(raw_header,@loading_guidelines[:allow_gap_columns])
+      # process the header line to create a "mask"
+      yield [raw_header[header_range],header_pos] # return the trimmed header
+      rec_count = self.class._emit_rows(scanner,header_range,
+                                        @loading_guidelines[:emit_blank_records],
+                                        trim_headers: @loading_guidelines[:trim_headers],
+                                        tolerate_blanks: @loading_guidelines[:allow_gap_rows],
+                                        &block
+                                       )
+      @is_mid_read = false
+      @source = nil if @replayable_flag.nil? # release resources
+      return rec_count
+    end #each
+
+
+    ###########################################################################
+    private
+
+    SEPARATOR_CHAR_LIST = {tsv: "\t", csv: ','}.freeze
+
+    # Note, due to the need to terminate if too many blanks, this may not read
+    # to the end of the file
+    def self._emit_rows(raw_rec_enum,range_mask,emit_blank_records,trim_headers:,
+                        tolerate_blanks: nil,
+                        &block
+                       )
+      contig_blank_count = 0
+      emitted_record_count = 0
+      need_to_trim_row = trim_headers # trim the first one
+      loop do
+        begin
+          rec,pos = raw_rec_enum.next
+          masked_rec = rec&.[](range_mask)
+          next if masked_rec.nil?
+          is_blank_record = masked_rec.all?{|s| s.nil? || /^\s*$/.match?(s)}
+          contig_blank_count = is_blank_record ? (contig_blank_count+1) : 0
+          if tolerate_blanks && contig_blank_count > tolerate_blanks
+            return emitted_record_count # end of records
+          else
+            if emit_blank_records || !is_blank_record
+              if need_to_trim_row # only done once (for header) if at all
+                masked_rec.map!{|s| s&.strip}
+                need_to_trim_row = false # only first emitted row
+              end
+              yield [masked_rec,pos]
+              emitted_record_count += 1
+            end
+          end
+        rescue StopIteration
+          return emitted_record_count # running out of records is an okay end
+        end # rescuing
+      end #loop over records
+      raise "Unexpected issue encountered." # should never get here
+    end # self._emit_rows()
+
+    MIN_NONBLANK_HEADER_BEFORE_GAP_ALLOWED = 3
+
+    # Given a presumed header, identify the position of the largest contiguous
+    # block of non-blank fields and return the range information to be used
+    # for scraping successive rows.
+    def self._calc_header_range(raw_header_record,max_blank_cols)
+      max_blank_cols ||= 0 # default is no blank columns tolerated in header
+      blank_tol = max_blank_cols
+      ix0 = nil
+      runs = Array.new
+      (0..(raw_header_record.length-1)).each{|ix|
+        if raw_header_record[ix].nil? || /^\s*$/.match?(raw_header_record[ix])
+          if ix0.nil?
+            # deliberate no-op, in middle of blank run
+          elsif blank_tol >= 0 && ix-ix0 >= MIN_NONBLANK_HEADER_BEFORE_GAP_ALLOWED
+            # a minimum run of content-filled columns must be found before tolerating blanks
+            blank_tol -= 1
+          else
+            runs << [ix0,ix-1]
+            ix0 = nil
+            blank_tol = max_blank_cols
+          end
+        else # non blank
+          ix0 ||= ix # record start of run
+          blank_tol = max_blank_cols # reset tolerance for blanks
+        end
+      }
+      runs << [ix0,raw_header_record.length-1-(max_blank_cols-blank_tol)] if ix0
+      widest = runs.max{|a,b| a[1]-a[0] <=> b[1]-b[0] }
+      return widest && (widest[0]..widest[1]) # range defines the columns to scrape
+    end
+
+
+    # Opens up a record stream based on the source provided and the format rule
+    # specified.
+    # @param source [] Many different values are possible
+    #        1) a filepath to a readable text file
+    #        2) a string variable (must contain at least one newline)
+    #        3) An enumerable of strings where each string is a record "line" to be parsed individually
+    #        4) An enumerable of arrays of strings (where the inner array is the column values).
+    #           In this case the format parameter is ignored
+    # @param format [:csv,:tsv,:guess,Regexp] Indicates how to interpret column delimiters
+    #        and row delimiters. The format parameter is ignored if the source
+    #        is an enumerable of arrays (case 4 above).
+    # @return [Enumerator<Array>] Whose next() method returns two-element arrays
+    #        where the first element is the fields of the record
+    #        (in the form of an array of strings) and the second element
+    #        is the zero-based index indicating the record number within the source.
+    #        Note that the Enumerator returned may not be rewindable/replayable.
+    def self._make_record_stream(source,format: :csv||:tsv||:guess||nil)
+      col_sep = case format
+                when nil
+                  nil
+                when :csv,:tsv
+                  SEPARATOR_CHAR_LIST[format]
+                when :guess
+                  # in the future, we might be able to guess based on reading
+                  # the first line and looking for tabs or commas
+                  if source.is_a?(String) && /\.(csv|tsv)$/i.match(source)
+                    SEPARATOR_CHAR_LIST[$1.downcase.to_sym]
+                  else
+                    raise "Format specified as :guess but unable to deduce format"
+                  end
+                else
+                  raise "Currently unhandled format specifier [#{format}]"
+                end
+      case source
+      in Enumerable if source.first.is_a?(String)
+        # This is the most manual case because of the need to try and detect
+        # lines that are split by a quoted multiline string.
+        return _chunk_lines(source,col_sep)
+      in Enumerable if source.first.is_a?(Array) and source.first.first.is_a?(String)
+        # no need for further processing, assume it already is a record source
+        return source.each_with_index
+      in String if source.include?("\n")
+        # if passed a string, it must be multiline to be treated as the data source
+        return CSV.parse(source,col_sep: col_sep).each_with_index
+      in String if !File.exist?(source)
+        raise "Unable to find the indicated file: #{source}"
+      in String if File.exist?(source) # presumed to be valid filepath
+        return CSV.foreach(source,col_sep: col_sep).each_with_index
+      end #case source
+      raise "Unable to convert the provided source into a record stream"
+    end # self._make_record_stream()
+
+
+
+    # Tests a string to see if the last field looks like it might have a
+    # multiline field. This is detected by checking for whether the rightmost
+    # field has unbalanced quote characters.
+    # IMPORTANT NOTE: It relies on being told whether the line begins within a
+    # quote
+    # (meaning that a complete line contains at least one unquoted separator)
+    def self._is_dangling?(line,sep_char,started_in_quote,quot_char: '"')
+      qc = Array.new #quote counts
+      qc << (started_in_quote ? 1 : 0)
+      # count quotes in each field to identify unbalanced quotes
+      line.each_char{|c|
+        if c == quot_char
+          qc[-1] += 1
+        elsif c == sep_char && qc.last.even?
+          qc << 0
+        end
+      }
+      return qc.last.odd?
+    end
+
+
+    # bunches up groups of lines when they look like the last column may
+    # contain a multiline value (with embedded carriage return)
+    def self._chunk_lines(line_enum,sep_char,&is_continued)
+      return enum_for(:_chunk_lines,line_enum,sep_char,&is_continued) unless block_given?
+      buf = ""
+      ix = 0 # will be a one-based counter
+      is_in_quote = false
+      line_enum.each{|line|
+        if _is_dangling?(line,sep_char,is_in_quote)
+          is_in_quote = true
+          buf.concat(line,line[-1] == "\n" ? '' : "\n")
+        else
+          is_in_quote = false
+          rec = CSV.parse_line((buf.empty? ? line : buf.concat(line)),col_sep: sep_char)
+          buf.clear
+          yield [rec,(ix+=1)-1]
+        end
+      }
+      yield [CSV.parse_line(buf,col_sep: sep_char),(ix+=1)-1] unless buf.empty?
+      return nil #meaningless return value
+    end
+
+
+    # Assuming we are starting from the absolute top of the source, scan
+    # forward looking for the header row according to the provided guidelines.
+    # Note, it may have to read past the header row to ensure it's made the best
+    # possible choice for that header row.
+    #
+    # @param raw_rec_enum [Enumerator] record enumerator for "raw" records such
+    #        as is generated by _make_record_stream
+    # @param guidelines [Hash] guidelines package as generated during
+    #        instantiation of the class. Note this method is not intended to be
+    #        called publicly.
+    # @return [Enumerator] Enumerator that should replace the enumerator passed
+    #        in and whose first record is the header row.
+    # Important Note: The position of the raw_rec_enum is almost certain to be
+    #        changed by calling next() on it. It should not be used after this
+    #        call because of this and other buffer considerations.
+    def self._reposition_to_header_rec(raw_rec_enum,guidelines)
+      buffer = Array.new
+      begin
+        loop do
+          buffer << raw_rec_enum.next
+          break if buffer.length > guidelines[:allow_leading_skip_rows]
+        end
+      rescue StopIteration #deliberately a no-op
+      end
+      return nil if buffer.empty?
+      scores = Hash.new{|h,ix| h[ix] = 0} # scoring for possible header row
+      mand_cols = guidelines[:mandatory_headers]
+      buffer.each_with_index{|(rec,_),buf_pos|
+        hdr_range = _calc_header_range(rec,guidelines[:allow_gap_columns]) # best possible header range
+        next if hdr_range.nil?
+        if mand_cols.empty? || (mand_cols - rec[hdr_range]).empty?
+          scores[buf_pos] = 10*(hdr_range.size) + # prefer wide columns
+                            (mand_cols.empty? ? 0 : 99000) + # huge bonus for having mandatory columns
+                            (1 - buf_pos.to_f/buffer.length) # slight preference for early records
+        end
+        # possible other factors for future consideration:
+        #   preceding blank line
+        #   containing a non-blank row immediately beneath it
+      } # end examination of rows in the buffer
+      best_guess = scores.max{|a,b| a[1] <=> b[1] }
+      if best_guess.nil?
+        raise "Unable to find header record within the first #{[buffer.length,guidelines[:allow_leading_skip_rows]].min} records examined!"
+      end
+      #buffer[best_guess[0]..-1].to_enum + raw_rec_enum # chain enumerators to include buffer
+      return self._joined_enum(buffer,best_guess[0]...buffer.length, raw_rec_enum)
+    end
+
+    # This hack-method was added because for some reason Enumerator::Chain
+    # does not seem to support the next() method, so I had to hand-roll
+    # the Chain
+    # @return [nil,Enumerator]
+    def self._joined_enum(buffer1,buffer1_range,record_enum)
+      return enum_for(:_joined_enum,buffer1,buffer1_range,record_enum) unless block_given?
+      buffer1_range.each{|ix| yield buffer1[ix] }
+      buffer1 = nil # release (in case it matters)
+      begin
+        loop do
+          yield record_enum.next
+        end
+      rescue StopIteration #deliberately a no-op
+      end
+      return nil # meaningless return value
+    end
+
+  end #class TabularRecordSource
+
+
+end #module Nodepile
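
A minimal sketch of TabularRecordSource on an in-memory CSV string (the data is invented; a multiline String source is case 2 in _make_record_stream, and the junk line ahead of the header exercises the header hunt in _reposition_to_header_rec):

    require 'nodepile/rec_source'

    text = <<~CSV
      scratch notes that precede the real table

      name,kind,notes
      a,node,first
      b,node,second
    CSV
    src = Nodepile::TabularRecordSource.new(text, format: :csv)
    src.each{|(fields,rec_num)| p [rec_num,fields] }
    # First yield is the header ["name","kind","notes"] with its zero-based
    # record number; subsequent yields are the data rows.
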