nodepile 0.1.1 → 0.1.2
- checksums.yaml +4 -4
- data/.rspec +2 -0
- data/.rubocop.yml +1 -1
- data/BACKLOG.md +34 -0
- data/Rakefile +92 -2
- data/lib/nodepile/base_structs.rb +62 -0
- data/lib/nodepile/colspecs.rb +562 -0
- data/lib/nodepile/gross_actions.rb +38 -0
- data/lib/nodepile/gviz.rb +108 -0
- data/lib/nodepile/keyed_array.rb +386 -0
- data/lib/nodepile/pile_organizer.rb +258 -0
- data/lib/nodepile/pragmas.rb +97 -0
- data/lib/nodepile/rec_source.rb +329 -0
- data/lib/nodepile/rule_eval.rb +155 -0
- data/lib/nodepile/version.rb +1 -1
- data/nodepile.gemspec +53 -0
- data/tmp/.gitignore +1 -0
- metadata +136 -19
data/lib/nodepile/pile_organizer.rb
@@ -0,0 +1,258 @@
require 'nodepile/colspecs.rb'
require 'nodepile/rec_source.rb'
require 'nodepile/pragmas.rb'
require 'nodepile/rule_eval.rb'

module Nodepile

  # Container class for managing a Nodepile. A nodepile consists of a set of
  # entities including nodes, edges, and rules. It includes methods for
  # enumerating the various items in the collection, filtering, and
  # deducing the existence of implied edges using rules.
  class PileOrganizer

    # see nodepile/base_structs.rb for the definition of a Nodepile::EntityPacket
    class ERecStack; end #ERecStack defined further down
    class RuleCache; end # defined further down
    SourceData = Struct.new(:source_name,:highest_sequence_num)


    def initialize()
      @nodes = Hash.new{|h,k| h[k] = ERecStack.new}
      @edges = Hash.new{|h,k| h[k] = ERecStack.new}
      @rules = Array.new # not subject to overlaying with themselves
      @pragmas = Nodepile::Pragmas.new
      @sources = Hash.new{|h,k| h[k] = SourceData.new(k,0)} # insert a dummy source for unspecified
      @last_source_name = nil
      @dirty = true
    end

    # If a source name is not specified, then the source is assumed to be
    # the last source that was used to append. If no sequence number is provided
    # then the sequence_number is assumed to be one more than the highest
    # sequence number that was specified. If callers are manually specifying
    # sequence numbers for a source, they should do so consistently to avoid
    # repeats.
    # @param kaa [KeyedArrayAccessor] Includes metadata about @type, @key, @is_implied
    # @return [self]
    def append(kaa)
      @last_source_name = kaa.source || @last_source_name
      source_data = @sources[@last_source_name]
      # note that the way things work below deliberately "overlays" items
      # when a matching key is encountered. Rule recalculation is deferred
      case kaa['@type']
      when :node
        @nodes[kaa['@key']] << kaa
      when :edge
        @edges[kaa['@key']] << kaa
      when :rule
        @rules << RuleCache.new(kaa)
      when :pragma
        @pragmas.parse(kaa['_id'])
      else
        raise "Unhandled entity type #{kaa['@type'].inspect}"
      end #case
      return self
    end
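
    # A minimal usage sketch (illustrative only; node_kaa and edge_kaa stand
    # in for KeyedArrayAccessor records built elsewhere, e.g. by
    # InputColumnSpecs#parse, carrying the '@type' and '@key' metadata
    # described above):
    #
    # @example
    #   pile = Nodepile::PileOrganizer.new
    #   pile.append(node_kaa)   # node_kaa['@type'] == :node
    #   pile << edge_kaa        # #<< is an alias for #append (defined below)
    #   pile.node_count         # => 1
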
    def node_count() = @nodes.length
    def rule_count() = @rules.length
    def edge_count() = @edges.length
    def pragmas() = @pragmas

    def entity_record(key)
      _update_rule_impacts
      case key
      when String
        return @nodes[key]
      when Array
        return @edges[key]
      else
        raise "Unrecognized key structure/type"
      end
    end

    # Provide summarized records in order
    def edge_records
      return enum_for(:edge_records) unless block_given?
      _update_rule_impacts
      @edges.each_value{|erstack| yield(erstack.summary) }
    end

    # Provide the summarized records
    def node_records
      return enum_for(:node_records) unless block_given?
      _update_rule_impacts
      @nodes.each_value{|erstack| yield(erstack.summary) }
    end


    # Alias for #append()
    # @param entity_record [Nodepile::KeyedArrayAccessor]
    def <<(entity_record) = append(entity_record)

    # Loads the given file (on top of anything already stored in this object)
    def load_from_file(tabular_filepath)
      source = Nodepile::TabularRecordSource.new(tabular_filepath,format: :guess)
      specs = nil
      loaded_entity_count = 0
      rec_src_meta = {'path' => tabular_filepath,'rec_num' => nil}
      metadata = Hash.new
      source.each{|(rec,rec_num)|
        rec_src_meta['rec_num'] = rec_num
        if specs.nil? #first row is header
          specs = Nodepile::InputColumnSpecs.new(rec,metadata_key_prefix: '@')
        else
          begin
            specs.parse(rec,source: tabular_filepath,
                        ref_num: rec_num,
                        metadata: metadata,
                       ){|keyed_array_accessor|
              append(keyed_array_accessor)
              loaded_entity_count += 1
            }
          rescue Nodepile::InputColumnSpecs::InvalidRecordError => err
            # re-raise but add info about the record number that triggered the error
            err.rec_num = rec_num
            err.file_path = tabular_filepath
            raise # re-raise
          end
        end #if
      }
      return loaded_entity_count
    end
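
    # A minimal loading sketch (illustrative only; 'mygraph.csv' is a
    # hypothetical tabular file whose first recognized row is the header):
    #
    # @example
    #   pile = Nodepile::PileOrganizer.new
    #   pile.load_from_file('mygraph.csv')   # => count of loaded entities
    #   pile.node_records.each{|summary| puts summary['@key'] }
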
    private

    # This sledgehammer approach deletes all calculated impacts that may
    # have previously been applied to the @edges and @nodes and recalculates
    # all of them.
    #
    # Method is a no-op if the structures are up-to-date
    def _update_rule_impacts(force = false)
      return nil unless force || @dirty
      @nodes.each_value(&:purge_rule_overlays)
      @edges.each_value(&:purge_rule_overlays)
      @rules.each{|rulecache|
        recs = (rulecache.relevant_entity_type == :node ? @nodes : @edges).each_value
        recs.each{|erstack|
          if rulecache.match?(erstack.summary)
            # calculate the rule as applied to the given node/edge
            calculated_rule_erec = rulecache.eval_using(erstack.summary)
            erstack << calculated_rule_erec
          end
        }
      } #loop over rules
      @dirty = false
    end

    # An ERecStack is a data structure used for holding and summarizing
    # overlay-able records related to a given Node or Edge which can include
    # "rules" that apply to that node/edge
    class ERecStack
      def initialize()
        @a = Array.new
        @summary = nil
        @mc = CrudeCalculationCache.new
      end

      def inspect
        "#<#{self.class}:0x#{object_id} type=#{type} key=#{self.key.inspect} depth=#{@a.length}>"
      end


      def type = @a.first['@type']
      def key() = @a.first['@key']

      def is_node? = self.type == :node
      def is_edge? = self.type == :edge
      def summary() = @summary
      def to_a = @a

      # A stack of type :node or :edge is implied if it contains
      # no ERec records where the is_implied attribute is false.
      # The return value of this method is undefined for other types.
      def is_implied
        @a.each{|kaa| return false if !kaa['@is_implied'] &&
                                      [:node,:edge].include?(kaa['@type']) }
        return true
      end

      # Delete overlayed rule records
      # @return [void]
      def purge_rule_overlays()
        @a.delete_if{|rec| rec['@type'] == :rule}
        @summary = nil # discard the stale summary before recalculating
        @a.each{|rec| @summary = @summary.nil? ? rec.dup : self.class._update_summary(@summary,rec)} #recalc
      end


      # Note that this method does not verify whether it is appropriate to stack the new
      # record and assumes callers have already done this due-diligence.
      # @param rec [KeyedArrayAccessor]
      def <<(rec)
        raise "ERecStack may only hold objects of type Nodepile::KeyedArrayAccessor" unless rec.is_a?(Nodepile::KeyedArrayAccessor)
        # Keep the summary up-to-date if we've got one.
        @a << rec
        if @a.length == 1
          @summary = rec.dup
        else
          @summary = self.class._update_summary(@summary,rec)
        end
        return self
      end

      def each_keyed_array()
        return enum_for(:each_keyed_array) unless block_given?
        @a.each{|erec| yield erec }
      end

      private
      def self._update_summary(cur_summary,new_overlay)
        return new_overlay if cur_summary.nil?
        cur_summary.underlay!(new_overlay)
        cur_summary.source = nil if cur_summary.source != new_overlay.source
        cur_summary.ref_num = nil #summary no longer represents a single ref_num
        cur_summary.update_metadata('@is_implied',false) unless new_overlay['@is_implied']
        return cur_summary
      end

    end #class Nodepile::PileOrganizer::ERecStack
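
    # A minimal overlay sketch (illustrative only; kaa_1 and kaa_2 stand in
    # for two KeyedArrayAccessor records sharing the same '@key'):
    #
    # @example
    #   stack = ERecStack.new
    #   stack << kaa_1
    #   stack << kaa_2
    #   stack.summary   # single merged view of every record pushed so far
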
    # Represents cached information about a single specific rule
    class RuleCache

      # @param rule_erec [KeyedArrayAccessor] created from this rule record
      def initialize(rule_erec)
        @er = rule_erec.freeze #just to avoid casual alteration
        raise "Only ERecs of type :rule may be stored in this structure" unless self.type == :rule
        @verifiers = [*self.key].map{|s| Nodepile::InputColumnSpecs.make_pattern_match_verifier(s)}
        @rule_eval = RuleRecordEvaluator.new(@er)
      end

      def inspect
        "#<#{self.class}:0x#{object_id} key=#{self.key.inspect}>"
      end

      def type = @er['@type']
      def is_implied = @er['@is_implied']
      def key = @er['@key']

      # Note that a rule that uses dynamic matching cannot precalculate which
      # records it matches and must (to be safe) recalculate the match in
      # response to any changes
      def uses_dynamic_match? = @rule_eval.uses_dynamic_match?

      # @param kaa [KeyedArrayAccessor] a set of field values that will be
      #            tested against this rule for matching.
      def match?(kaa) = @rule_eval.match_record?(kaa)
      def eval_using(kaa) = @rule_eval.calculate_rule(kaa)


      def relevant_entity_type = @er['@key'].is_a?(String) ? :node : :edge

    end #class Nodepile::PileOrganizer::RuleCache

  end #class PileOrganizer


end #module Nodepile
data/lib/nodepile/pragmas.rb
@@ -0,0 +1,97 @@
module Nodepile

  # Pragmas is a parser and organizer class used to interpret the meaning
  # of pragmas that may appear in source files. By default, pragmas are
  # coded instructions for rendering, parsing, or layout that may be embedded
  # in input files. They are often instructions stored in the "_id" field
  # of an input file that begin with a specific indicating string '#pragma '.
  # Pragmas may be used to control things like the specific layout engine that
  # is used to visualize a graph (e.g. dot versus neato).
  # Example pragma lines are:
  #
  #    #pragma neato
  #    #pragma unflatten
  #
  # Create an instance of a Nodepile::Pragmas object in order to track and
  # interpret the collective pragmas of a given graph. Note that the most
  # common rule in #pragma interpretation is that if two pragmas contradict each
  # other, the one furthest down in the file/parsing stream dominates.
  class Pragmas
    DEFAULT_PRAGMA_MARKER = "#pragma "

    @@indicator_patterns = Array.new # array of [pragma_sym,regexp]

    def initialize(pragma_marker: DEFAULT_PRAGMA_MARKER)
      @marker = pragma_marker.freeze
      @indicators = Hash.new #name mapped to value
      # if you make this method more complex, remember to update #dup
    end

    def dup
      c = self.class.new(pragma_marker: @marker)
      c._indicators.merge!(@indicators)
      return c
    end

    def [](pragma_sym)
      @indicators[pragma_sym]
    end

    # @yield [pragma_sym,pragma_val] provides pairs of names and values as they
    #        have been set. Only set values will appear in the yielded set.
    def each_setting_pair
      return enum_for(__method__) unless block_given?
      @indicators.each_pair{|k,v| yield(k,v) }
      return @indicators.length
    end

    # Parse the given pragma and store the meaning for access via the square
    # bracket method or the #each_setting_pair method.
    # @return [void]
    def parse(pragma_string)
      raise "Expecting pragma_string to start with [#{@marker}]" unless pragma_string.start_with?(@marker)

      #TODO: If there are more complicated parsing rules, they should go before
      # the simple fallthrough case of whitespace-separated indicators

      # Simple indicators are single "word" values where each value is delimited
      # by whitespace. If two indicators apply to the same pragma_sym, the one
      # parsed last wins.
      pragma_string[@marker.length..-1].split(/\s+/).each{|s|
        prag_sym,_ = @@indicator_patterns.find{|(prag_sym,rx)| rx.match(s) }
        if prag_sym
          @indicators[prag_sym] = $1
        else
          raise "Unrecognized pragma encountered [#{s}]"
        end
      }
      return nil
    end
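
    # A minimal usage sketch (illustrative only; uses the simple indicators
    # declared further down in this class):
    #
    # @example
    #   p = Nodepile::Pragmas.new
    #   p.parse('#pragma neato digraph')
    #   p[:layout_engine]    # => "neato"
    #   p[:directionality]   # => "digraph"
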
    private

    # Declares a "simple indicator": a pragma whose presence or absence is the
    # indicator. Such indicators can be stacked (multiple per pragma expression).
    # For example:
    #    #pragma neato unflatten
    #
    # The above line would mean that two separate effects were being invoked,
    # the use of the "neato" rendering engine and the use of the unflatten
    # setting to improve the aspect ratio of some directed graphs
    def self._decl_simple_indicator(prag_name,alt_regexp)
      @@indicator_patterns << [prag_name,alt_regexp].freeze
    end

    _decl_simple_indicator(:layout_engine,/^(dot|neato|fdp|sfdp|circo|twopi|nop2|osage|patchwork)$/)
    _decl_simple_indicator(:directionality,/^(graph|digraph)$/)
    #_decl_simple_indicator(:unflatten,/^(unflatten)$/) # not sure this is supported

    protected

    # Crude accessor (protected so #dup can reach it) for duplication and merge
    def _indicators = @indicators


  end #class Nodepile::Pragmas


end #module Nodepile
data/lib/nodepile/rec_source.rb
@@ -0,0 +1,329 @@
require 'stringio'
require 'csv' # the CSV stdlib is used below for record parsing

module Nodepile

  # Generates "factories" for harvesting tabular data from a source stream/file.
  # Includes facilities for parsing common file formats (CSV/TSV).
  # Includes facilities for handling common problems encountered when parsing
  # manually-created tabular data files, such as: relevant tabular data that is
  # not aligned "top-left", tabular data that includes blank or repeated
  # columns, tabular data that ends before the end of the file, and
  # summary rows that appear in the tabular data and need to be ignored.
  class TabularRecordSource
    include Enumerable

    DEFAULT_LOADING_GUIDELINES = {
      mandatory_headers: [], # this can be extremely important to correctly finding tables
      format: :csv||:tsv||:guess, #assume CSV unless told otherwise
      allow_leading_skip_rows: 10, # arbitrary content that may appear before table
      allow_gap_rows: 2||nil, # entirely blank rows appearing mid-table, nil indicates allow infinite
      allow_gap_columns: 1, # columns which have a blank header within the table
      allow_left_offset: 5, # blank columns allowed left of table
      duplicate_header_rule: :first||:last||:ignore||:rename||:fail, #keep the first
      ignored_header_char: '#', # header names starting with this are just plain ignored
      emit_blank_records: false, # unless true, entirely blank records are not returned
      trim_headers: true, #strip leading and trailing spaces
    }.freeze

    # Create a new RecordSource intended to read from the specified input
    # and using the parsing strategy specified by the loading guidelines.
    def initialize(source,**loading_guidelines)
      (loading_guidelines.keys - DEFAULT_LOADING_GUIDELINES.keys).tap{|x| raise <<~ERRMSG unless x.empty?}
        Unrecognized named parameters used for RecordSource creation #{x.inspect}
      ERRMSG
      @loading_guidelines = DEFAULT_LOADING_GUIDELINES.merge(loading_guidelines).freeze
      raise "The source must be non-nil" if source.nil?
      @source = source # will lazy load
      @is_mid_read = false # only relevant for non-parallel sources
      @replayable_flag = if @source.is_a?(String)
                           :parallel # simultaneous each() is okay
                         elsif @source.respond_to?(:rewind)
                           :single # can't guarantee simultaneous each() is safe
                         else
                           nil
                         end
    end #new


    # Yields the "records" of the first "table" encountered in the bound data
    # source according to the parameters it was given. The first row yielded is
    # always the header. Raises an error if a header is not found.
    # Beware... depending on the type of data source used at creation, it
    # may not be possible to rewind or retrieve data in parallel.
    # With that said, a filename or String both allow parallel retrieval.
    #
    # Also note that blank strings will be passed through until the specified
    # allow_gap_rows are exceeded. This can mean trailing blanks in long files.
    #
    # @yieldparam [Array] Array includes at least two elements. The first is
    #             an Array of "fields". The second element is the record
    #             number within the source (zero-indexed). It's important
    #             to note that if any field contains embedded newlines, the
    #             record number is not the same as the line number.
    # @return [Integer,Enumerator] Returns enumerator if no block is given.
    #             Otherwise returns the count of records yielded excluding
    #             the header line.
    def each(&block)
      return enum_for(:each) unless block_given?
      raise "This data source type may only be read once." if @source.nil?
      raise <<~ERRMSG if @is_mid_read && @replayable_flag != :parallel
        For this type of data source, you may not read simultaneously.
      ERRMSG
      @is_mid_read = true
      scanner = self.class._make_record_stream(@source,format: @loading_guidelines[:format])
      scanner = self.class._reposition_to_header_rec(scanner,@loading_guidelines)
      raw_header,header_pos = scanner.next
      header_range = self.class._calc_header_range(raw_header,@loading_guidelines[:allow_gap_columns])
      # process the header line to create a "mask"
      yield [raw_header[header_range],header_pos] # return the trimmed header
      rec_count = self.class._emit_rows(scanner,header_range,
                                        @loading_guidelines[:emit_blank_records],
                                        trim_headers: @loading_guidelines[:trim_headers],
                                        tolerate_blanks: @loading_guidelines[:allow_gap_rows],
                                        &block
                                       )
      @is_mid_read = false
      @source = nil if @replayable_flag.nil? # release resources
      return rec_count
    end #each
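
    # A minimal usage sketch (illustrative only; a multiline String is one of
    # the accepted source types, and the header row is always yielded first):
    #
    # @example
    #   src = Nodepile::TabularRecordSource.new("a,b\n1,2\n", format: :csv)
    #   src.each{|(fields,rec_num)| p [fields,rec_num] }
    #   # roughly: [["a","b"],0] then [["1","2"],1]
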
    ###########################################################################
    private

    SEPARATOR_CHAR_LIST = {tsv: "\t", csv: ','}.freeze

    # Note, due to the need to terminate if too many blanks are seen, this may
    # not read to the end of the file
    def self._emit_rows(raw_rec_enum,range_mask,emit_blank_records,trim_headers:,
                        tolerate_blanks: nil,
                        &block
                       )
      contig_blank_count = 0
      emitted_record_count = 0
      need_to_trim_row = trim_headers # trim the first one
      loop do
        begin
          rec,pos = raw_rec_enum.next
          masked_rec = rec&.[](range_mask)
          next if masked_rec.nil?
          is_blank_record = masked_rec.all?{|s| s.nil? || /^\s*$/.match?(s)}
          contig_blank_count = is_blank_record ? (contig_blank_count+1) : 0
          if tolerate_blanks && contig_blank_count > tolerate_blanks
            return emitted_record_count # end of records
          else
            if emit_blank_records || !is_blank_record
              if need_to_trim_row # only done once (for the first row) if at all
                masked_rec.map!{|s| s&.strip}
                need_to_trim_row = false # only first emitted row
              end
              yield [masked_rec,pos]
              emitted_record_count += 1
            end
          end
        rescue StopIteration
          return emitted_record_count # running out of records is an okay end
        end # rescuing
      end #loop over records
      raise "Unexpected issue encountered." # should never get here
    end # self._emit_rows()

    MIN_NONBLANK_HEADER_BEFORE_GAP_ALLOWED = 3

    # Given a presumed header, identify the position of the largest contiguous
    # block of non-blank fields and return the range information to be used
    # for scraping successive rows.
    def self._calc_header_range(raw_header_record,max_blank_cols)
      max_blank_cols ||= 0 # default is no blank columns tolerated in header
      blank_tol = max_blank_cols
      ix0 = nil
      runs = Array.new
      (0..(raw_header_record.length-1)).each{|ix|
        if raw_header_record[ix].nil? || /^\s*$/.match?(raw_header_record[ix])
          if ix0.nil?
            # deliberate no-op, in middle of blank run
          elsif blank_tol >= 0 && ix-ix0 >= MIN_NONBLANK_HEADER_BEFORE_GAP_ALLOWED
            # at least MIN_NONBLANK_HEADER_BEFORE_GAP_ALLOWED content-filled
            # columns must be found before tolerating blanks
            blank_tol -= 1
          else
            runs << [ix0,ix-1]
            ix0 = nil
            blank_tol = max_blank_cols
          end
        else # non blank
          ix0 ||= ix # record start of run
          blank_tol = max_blank_cols # reset tolerance for blanks
        end
      }
      runs << [ix0,raw_header_record.length-1-(max_blank_cols-blank_tol)] if ix0
      widest = runs.max{|a,b| a[1]-a[0] <=> b[1]-b[0] }
      return widest && (widest[0]..widest[1]) # range defining the column mask
    end
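
    # A minimal illustration of the range calculation (illustrative only):
    #
    # @example
    #   _calc_header_range(['','id','label','weight','','notes'], 1)
    #   # => (1..5) -- the leading blank column is excluded, and the single
    #   # blank at index 4 is tolerated because allow_gap_columns is 1 and at
    #   # least MIN_NONBLANK_HEADER_BEFORE_GAP_ALLOWED headers precede it
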
    # Opens up a record stream based on the source provided and the format rule
    # specified.
    # @param source [Object] Many different values are possible:
    #    1) a filepath to a readable text file
    #    2) a string variable (must contain at least one newline)
    #    3) an enumerable of strings where each string is a record "line" to be parsed individually
    #    4) an enumerable of arrays of strings (where the inner array is the column values).
    #       In this case the format parameter is ignored.
    # @param format [:csv,:tsv,:guess,Regexp] Indicates how to interpret column delimiters
    #    and row delimiters. The format parameter is ignored if the source
    #    is an enumerable of arrays (case 4 above).
    # @return [Enumerator<Array>] Whose next() method returns two-element arrays
    #    where the first element is the fields of the record
    #    (in the form of an array of strings) and the second element
    #    is the zero-based index indicating the record number within the source.
    #    Note that the Enumerator returned may not be rewindable/replayable.
    def self._make_record_stream(source,format: :csv||:tsv||:guess||nil)
      col_sep = case format
                when nil
                  nil
                when :csv,:tsv
                  SEPARATOR_CHAR_LIST[format]
                when :guess
                  # in the future, we might be able to guess based on reading
                  # the first line and looking for tabs or commas
                  if source.is_a?(String) && /\.(csv|tsv)$/i.match(source)
                    SEPARATOR_CHAR_LIST[$1.downcase.to_sym]
                  else
                    raise "Format specified as :guess but unable to deduce format"
                  end
                else
                  raise "Currently unhandled format specifier [#{format}]"
                end
      case source
      in Enumerable if source.first.is_a?(String)
        # This is the most manual case because of the need to try and detect
        # lines that are split by a quoted multiline string.
        return _chunk_lines(source,col_sep)
      in Enumerable if source.first.is_a?(Array) and source.first.first.is_a?(String)
        # no need for further processing, assume it already is a record source
        return source.each_with_index
      in String if source.include?("\n")
        # if passed a string, it must be multiline to be treated as the data source
        return CSV.parse(source,col_sep: col_sep).each_with_index
      in String if !File.exist?(source)
        raise "Unable to find the indicated file: #{source}"
      in String if File.exist?(source) # presumed to be valid filepath
        return CSV.foreach(source,col_sep: col_sep).each_with_index
      end #case source
      raise "Unable to convert the provided source into a record stream"
    end # self._make_record_stream()



    # Tests a string to see if the last field looks like it might have a
    # multiline field. This is detected by checking whether the rightmost
    # field has unbalanced quote characters.
    # IMPORTANT NOTE: It relies on being told whether the line begins within a
    # quote
    # (meaning that a complete line contains at least one unquoted separator)
    def self._is_dangling?(line,sep_char,started_in_quote,quot_char: '"')
      qc = Array.new #quote counts
      qc << (started_in_quote ? 1 : 0)
      # count quotes in each field to identify unbalanced quotes
      line.each_char{|c|
        if c == quot_char
          qc[-1] += 1
        elsif c == sep_char && qc.last.even?
          qc << 0
        end
      }
      return qc.last.odd?
    end
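
    # A minimal illustration (illustrative only; a trailing unbalanced quote
    # means the record continues on the next line):
    #
    # @example
    #   _is_dangling?('a,"start of multi', ',', false)   # => true
    #   _is_dangling?('a,b,c', ',', false)               # => false
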
    # Bunches up groups of lines when they look like the last column may
    # contain a multiline value (with an embedded newline)
    def self._chunk_lines(line_enum,sep_char,&is_continued)
      return enum_for(:_chunk_lines,line_enum,sep_char,&is_continued) unless block_given?
      buf = ""
      ix = 0 # will be one-based counter
      is_in_quote = false
      line_enum.each{|line|
        if _is_dangling?(line,sep_char,is_in_quote)
          is_in_quote = true
          buf.concat(line,line.end_with?("\n") ? '' : "\n")
        else
          is_in_quote = false
          rec = CSV.parse_line((buf.empty? ? line : buf.concat(line)),col_sep: sep_char)
          buf.clear
          yield [rec,(ix+=1)-1]
        end
      }
      yield [CSV.parse_line(buf,col_sep: sep_char),(ix+=1)-1] unless buf.empty?
      return nil #meaningless return value
    end


    # Assuming we are starting from the absolute top of the source, scan
    # forward looking for the header row according to the provided guidelines.
    # Note, it may have to read past the header row to ensure it's made the best
    # possible choice for that header row.
    #
    # @param raw_rec_enum [Enumerator] record enumerator for "raw" records such
    #    as is generated by _make_record_stream
    # @param guidelines [Hash] guidelines package as generated during
    #    instantiation of the class. Note this method is not intended to be
    #    called publicly.
    # @return [Enumerator] Enumerator that should replace the enumerator passed
    #    in and whose first record is the header row.
    # Important Note: The position of the raw_rec_enum is almost certain to be
    #    changed by calling next() on it. It should not be used after this
    #    call because of this and other buffer considerations.
    def self._reposition_to_header_rec(raw_rec_enum,guidelines)
      buffer = Array.new
      begin
        loop do
          buffer << raw_rec_enum.next
          break if buffer.length > guidelines[:allow_leading_skip_rows]
        end
      rescue StopIteration #deliberately a no-op
        return nil if buffer.empty?
      end
      scores = Hash.new{|h,ix| h[ix] = 0} # scoring for possible header row
      mand_cols = guidelines[:mandatory_headers]
      buffer.each_with_index{|(rec,_),buf_pos|
        hdr_range = _calc_header_range(rec,guidelines[:allow_gap_columns]) # best possible header range
        next if hdr_range.nil?
        if mand_cols.empty? || (mand_cols - rec[hdr_range]).empty?
          scores[buf_pos] = 10*(hdr_range.size) + # prefer wide header runs
                            (mand_cols.empty? ? 0 : 99000) + # huge bonus for having mandatory columns
                            (1 - buf_pos.to_f/buffer.length) # slight preference for early records
        end
        # possible other factors for future consideration:
        #    preceding blank line
        #    containing a non-blank row immediately beneath it
      } # end examination of rows in the buffer
      best_guess = scores.max{|a,b| a[1] <=> b[1] }
      if best_guess.nil?
        raise "Unable to find header record within the first #{[buffer.length,guidelines[:allow_leading_skip_rows]].min} records examined!"
      end
      #buffer[best_guess[0]..-1].to_enum + raw_rec_enum # chain enumerators to include buffer
      # note the exclusive range: valid buffer indexes run 0...buffer.length
      return self._joined_enum(buffer,best_guess[0]...buffer.length, raw_rec_enum)
    end

    # This hack-method was added because Enumerator::Chain does not seem to
    # support the next() method, so the chain had to be hand-rolled.
    # @return [nil,Enumerator]
    def self._joined_enum(buffer1,buffer1_range,record_enum)
      return enum_for(:_joined_enum,buffer1,buffer1_range,record_enum) unless block_given?
      buffer1_range.each{|ix| yield buffer1[ix] }
      buffer1 = nil # release (in case it matters)
      begin
        loop do
          yield record_enum.next
        end
      rescue StopIteration #deliberately a no-op
      end
      return nil # meaningless return value
    end

  end #class TabularRecordSource


end #module Nodepile