nodepile 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,258 @@
+ require 'nodepile/colspecs.rb'
+ require 'nodepile/rec_source.rb'
+ require 'nodepile/pragmas.rb'
+ require 'nodepile/rule_eval.rb'
+
+ module Nodepile
+
+   # Container class for managing a Nodepile. A nodepile consists of a set of
+   # entities including nodes, edges, and rules. It includes methods for
+   # enumerating the various items in the collection, filtering, and
+   # deducing the existence of implied edges using rules.
+   class PileOrganizer
+
+     # see nodepile/base_structs.rb for the definition of a Nodepile::EntityPacket
+     class ERecStack; end # ERecStack defined further down
+     class RuleCache; end # defined further down
+     SourceData = Struct.new(:source_name,:highest_sequence_num)
+
+
+     def initialize()
+       @nodes = Hash.new{|h,k| h[k] = ERecStack.new}
+       @edges = Hash.new{|h,k| h[k] = ERecStack.new}
+       @rules = Array.new # not subject to overlaying with themselves
+       @pragmas = Nodepile::Pragmas.new
+       @sources = Hash.new{|h,k| h[k] = SourceData.new(k,0)} # insert a dummy source for unspecified sources
+       @last_source_name = nil
+       @dirty = true
+     end
+
+     # If a source name is not specified, then the source is assumed to be
+     # the last source that was used to append. If no sequence number is provided,
+     # then the sequence number is assumed to be one more than the highest
+     # sequence number that was specified. If callers are manually specifying
+     # sequence numbers for a source, they should do so consistently to avoid
+     # repeats.
+     # @param kaa [KeyedArrayAccessor] Includes metadata about @type, @key, @is_implied
+     # @return [self]
+     def append(kaa)
+       @last_source_name = kaa.source || @last_source_name
+       source_data = @sources[@last_source_name]
+       # note that the way things work below deliberately "overlays" items
+       # when a matching key is encountered. Rule recalculation is deferred.
+       case kaa['@type']
+       when :node
+         @nodes[kaa['@key']] << kaa
+       when :edge
+         @edges[kaa['@key']] << kaa
+       when :rule
+         @rules << RuleCache.new(kaa)
+       when :pragma
+         @pragmas.parse(kaa['_id'])
+       else
+         raise "Unhandled entity type #{kaa['@type'].inspect}"
+       end #case
+       return self
+     end
+
+     def node_count() = @nodes.length
+     def rule_count() = @rules.length
+     def edge_count() = @edges.length
+     def pragmas() = @pragmas
+
+     def entity_record(key)
+       _update_rule_impacts
+       case key
+       when String
+         return @nodes[key]
+       when Array
+         return @edges[key]
+       else
+         raise "Unrecognized key structure/type"
+       end
+     end
+
+     # Provide summarized records in order
+     def edge_records
+       return enum_for(:edge_records) unless block_given?
+       _update_rule_impacts
+       @edges.each_value{|erstack| yield(erstack.summary) }
+     end
+
+     # Provide the summarized records
+     def node_records
+       return enum_for(:node_records) unless block_given?
+       _update_rule_impacts
+       @nodes.each_value{|erstack| yield(erstack.summary) }
+     end
+
+
+     # Alias for #append()
+     # @param entity_record [Nodepile::KeyedArrayAccessor]
+     def <<(entity_record) = append(entity_record)
+
+     # Loads the given file (on top of anything already stored in this object)
+     def load_from_file(tabular_filepath)
+       source = Nodepile::TabularRecordSource.new(tabular_filepath,format: :guess)
+       specs = nil
+       loaded_entity_count = 0
+       rec_src_meta = {'path' => tabular_filepath,'rec_num' => nil}
+       metadata = Hash.new
+       source.each{|(rec,rec_num)|
+         rec_src_meta['rec_num'] = rec_num
+         if specs.nil? #first row is header
+           specs = Nodepile::InputColumnSpecs.new(rec,metadata_key_prefix: '@')
+         else
+           begin
+             specs.parse(rec,source: tabular_filepath,
+                         ref_num: rec_num,
+                         metadata: metadata,
+                         ){|keyed_array_accessor|
+               append(keyed_array_accessor)
+               loaded_entity_count += 1
+             }
+           rescue Nodepile::InputColumnSpecs::InvalidRecordError => err
+             # re-raise but add info about the record number that triggered the error
+             err.rec_num = rec_num
+             err.file_path = tabular_filepath
+             raise # re-raise
+           end
+         end #if
+       }
+       return loaded_entity_count
+     end
+
+
+     private
+     # This sledgehammer approach deletes all calculated impacts that may
+     # have previously been applied to the @edges and @nodes and recalculates
+     # all of them.
+     #
+     # Method is a no-op if the structures are up-to-date.
+     def _update_rule_impacts(force = false)
+       return nil unless force || @dirty
+       @nodes.each_value(&:purge_rule_overlays)
+       @edges.each_value(&:purge_rule_overlays)
+       @rules.each{|rulecache|
+         recs = (rulecache.relevant_entity_type == :node ? @nodes : @edges).each_value
+         recs.each{|erstack|
+           if rulecache.match?(erstack.summary)
+             # calculate the rule as applied to the given node/edge
+             calculated_rule_erec = rulecache.eval_using(erstack.summary)
+             erstack << calculated_rule_erec
+           end
+         }
+       } #loop over rules
+       @dirty = false
+     end
+
+     # An ERecStack is a data structure used for holding and summarizing
+     # overlay-able records related to a given Node or Edge, which can include
+     # "rules" that apply to that node/edge.
+     class ERecStack
+       def initialize()
+         @a = Array.new
+         @summary = nil
+         @mc = CrudeCalculationCache.new
+       end
+
+       def inspect
+         "#<#{self.class}:0x#{object_id} type=#{type} key=#{self.key.inspect} depth=#{@a.length}>"
+       end
+
+
+       def type = @a.first['@type']
+       def key() = @a.first['@key']
+
+       def is_node? = self.type == :node
+       def is_edge? = self.type == :edge
+       def summary() = @summary
+       def to_a = @a
+
+       # A stack of type :node or :edge is implied if it contains
+       # no ERec records where the is_implied attribute is false.
+       # The return value of this method is undefined for other types.
+       def is_implied
+         @a.each{|kaa| return false if !kaa['@is_implied'] &&
+                       [:node,:edge].include?(kaa['@type']) }
+         return true
+       end
+
+       # Delete overlayed rule records
+       # @return [void]
+       def purge_rule_overlays()
+         @a.delete_if{|rec| rec['@type'] == :rule}
+         @summary = nil # rebuild from scratch so purged overlays don't linger in the summary
+         @a.each{|rec| @summary = @summary.nil? ? rec.dup : self.class._update_summary(@summary,rec)} #recalc
+       end
+
+
+       # Note that this method does not verify whether it is appropriate to stack the new
+       # record and assumes callers have already done this due diligence.
+       # @param rec [KeyedArrayAccessor]
+       def <<(rec)
+         raise "ERecStack may only hold objects of type Nodepile::KeyedArrayAccessor" unless rec.is_a?(Nodepile::KeyedArrayAccessor)
+         # Keep the summary up-to-date if we've got one.
+         @a << rec
+         if @a.length == 1
+           @summary = rec.dup
+         else
+           @summary = self.class._update_summary(@summary,rec)
+         end
+         return self
+       end
+
+       def each_keyed_array()
+         return enum_for(:each_keyed_array) unless block_given?
+         @a.each{|erec| yield erec }
+       end
+
+       private
+       def self._update_summary(cur_summary,new_overlay)
+         return new_overlay if cur_summary.nil?
+         cur_summary.underlay!(new_overlay)
+         cur_summary.source = nil if cur_summary.source != new_overlay.source
+         cur_summary.ref_num = nil # summary no longer represents a single ref_num
+         cur_summary.update_metadata('@is_implied',false) unless new_overlay['@is_implied']
+         return cur_summary
+       end
+
+     end #class Nodepile::PileOrganizer::ERecStack
+
+     # Represents cached information about a single specific rule
+     class RuleCache
+
+       # @param rule_erec [KeyedArrayAccessor] created from this rule record
+       def initialize(rule_erec)
+         @er = rule_erec.freeze # just to avoid casual alteration
+         raise "Only ERecs of type :rule may be stored in this structure" unless self.type == :rule
+         @verifiers = [*self.key].map{|s| Nodepile::InputColumnSpecs.make_pattern_match_verifier(s)}
+         @rule_eval = RuleRecordEvaluator.new(@er)
+       end
+
+       def inspect
+         "#<#{self.class}:0x#{object_id} key=#{self.key.inspect}>"
+       end
+
+       def type = @er['@type']
+       def is_implied = @er['@is_implied']
+       def key = @er['@key']
+
+       # Note that a rule that uses dynamic matching cannot precalculate which
+       # records it matches and must (to be safe) recalculate the match in response
+       # to any changes.
+       def uses_dynamic_match? = @rule_eval.uses_dynamic_match?
+
+       # @param kaa [KeyedArrayAccessor] a set of field values that will be
+       #        tested against this rule for matching.
+       def match?(kaa) = @rule_eval.match_record?(kaa)
+       def eval_using(kaa) = @rule_eval.calculate_rule(kaa)
+
+
+       def relevant_entity_type = @er['@key'].is_a?(String) ? :node : :edge
+
+     end #class Nodepile::PileOrganizer::RuleCache
+
+   end #class PileOrganizer
+
+
+ end #module Nodepile
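
A minimal usage sketch of the PileOrganizer API above. This is not part of the package; the top-level require and the file name nodes.csv are illustrative assumptions:

    require 'nodepile'                    # assumed gem entry point

    pile = Nodepile::PileOrganizer.new
    pile.load_from_file('nodes.csv')      # hypothetical tabular input file
    puts pile.node_count                  # number of distinct node keys loaded
    # summarized records; rule impacts are recalculated lazily on access
    pile.node_records.each{|rec| p rec }
    pile.edge_records.each{|rec| p rec }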
@@ -0,0 +1,97 @@
+
+
+ module Nodepile
+
+   # Pragmas is a parser and organizer class used to interpret the meaning
+   # of pragmas that may appear in source files. By default, pragmas are
+   # coded instructions for rendering, parsing, or layout that may be embedded
+   # in input files. They are often instructions stored in the "_id" field
+   # of an input file that begin with a specific marker string, '#pragma '.
+   # Pragmas may be used to control things like the specific layout engine that
+   # is used to visualize a graph (e.g. dot versus neato).
+   # Example pragma lines:
+   #
+   #   #pragma neato
+   #   #pragma unflatten
+   #
+   # Create an instance of a Nodepile::Pragmas object in order to track and
+   # interpret the collective pragmas of a given graph. Note that the most
+   # common rule in #pragma interpretation is that if two pragmas contradict each
+   # other, the one furthest down in the file/parsing stream dominates.
+   class Pragmas
+     DEFAULT_PRAGMA_MARKER = "#pragma "
+
+     @@indicator_patterns = Array.new # array of [pragma_sym,regexp]
+
+     def initialize(pragma_marker: DEFAULT_PRAGMA_MARKER)
+       @marker = pragma_marker.freeze
+       @indicators = Hash.new # name mapped to value
+       # if you make this method more complex, remember to update #dup
+     end
+
+     def dup
+       c = self.class.new(pragma_marker: @marker)
+       c._indicators.merge!(@indicators)
+       return c
+     end
+
+     def [](pragma_sym)
+       @indicators[pragma_sym]
+     end
+
+     # @yield [pragma_sym,pragma_val] provides pairs of name/values as they have
+     #        been set. Only set values will appear in the yielded set.
+     def each_setting_pair
+       return enum_for(__method__) unless block_given?
+       @indicators.each_pair{|k,v| yield(k,v) }
+       return @indicators.length
+     end
+
+     # Parse the given pragma and store the meaning for access via the square bracket
+     # method or the #each_setting_pair method.
+     # @return [void]
+     def parse(pragma_string)
+       raise "Expecting pragma_string to start with [#{@marker}]" unless pragma_string.start_with?(@marker)
+
+       #TODO: If there are more complicated parsing rules, they should go before
+       # the simple fallthrough case of whitespace-separated indicators
+
+       # Simple indicators are single "word" values where each value is delimited
+       # by whitespace. If two indicators apply to the same pragma_sym, the last
+       # one parsed wins.
+       pragma_string[@marker.length..-1].split(/\s+/).each{|s|
+         prag_sym,_ = @@indicator_patterns.find{|(sym,rx)| rx.match(s) }
+         if prag_sym
+           @indicators[prag_sym] = $1
+         else
+           raise "Unrecognized pragma encountered [#{s}]"
+         end
+       }
+       return nil
+     end
+
+     private
+
+     # Declares simple indicators: pragmas whose presence or absence is itself
+     # the indicator. Such indicators can be stacked (multiple per pragma expression).
+     # For example:
+     #   #pragma neato unflatten
+     #
+     # The above line would mean that two separate effects were being invoked:
+     # the use of the "neato" rendering engine and the use of the unflatten
+     # setting to improve the aspect ratio of some directed graphs.
+     def self._decl_simple_indicator(prag_name,alt_regexp)
+       @@indicator_patterns << [prag_name,alt_regexp].freeze
+     end
+
+     _decl_simple_indicator(:layout_engine,/^(dot|neato|fdp|sfdp|circo|twopi|nop2|osage|patchwork)$/)
+     _decl_simple_indicator(:directionality,/^(graph|digraph)$/)
+     #_decl_simple_indicator(:unflatten,/^(unflatten)$/) # not sure this is supported
+
+     protected
+
+     # Crude accessor (protected) for duplication and merge
+     def _indicators = @indicators
+
+
+   end #class Nodepile::Pragmas
+
+
+ end #module Nodepile
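
A short usage sketch of the Pragmas workflow, using only the two indicator patterns declared above (layout_engine and directionality):

    prag = Nodepile::Pragmas.new
    prag.parse('#pragma neato digraph')
    prag[:layout_engine]    # => "neato"
    prag[:directionality]   # => "digraph"
    # enumerate only the indicators that have been set
    prag.each_setting_pair{|sym,val| puts "#{sym}=#{val}" }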
@@ -0,0 +1,329 @@
+ require 'stringio'
+ require 'csv'
+
+ module Nodepile
+
+   # Generates "Factories" for harvesting tabular data from a source stream/file.
+   # Includes facilities for parsing common file formats (CSV/TSV).
+   # Includes facilities for handling common problems encountered when parsing
+   # manually-created tabular data files, such as: relevant tabular data is not
+   # aligned "top-left", tabular data includes blank or repeated columns,
+   # tabular data ends before end of file, or
+   # summary rows appear in the tabular data that need to be ignored.
+   class TabularRecordSource
+     include Enumerable
+
+     DEFAULT_LOADING_GUIDELINES = {
+       mandatory_headers: [], # this can be extremely important for correctly finding tables
+       format: :csv||:tsv||:guess, # assume CSV unless told otherwise
+       allow_leading_skip_rows: 10, # arbitrary content that may appear before table
+       allow_gap_rows: 2||nil, # entirely blank rows appearing mid-table; nil indicates allow infinite
+       allow_gap_columns: 1, # columns which have a blank header within the table
+       allow_left_offset: 5, # blank columns allowed left of table
+       duplicate_header_rule: :first||:last||:ignore||:rename||:fail, # keep the first
+       ignored_header_char: '#', # header names starting with this are just plain ignored
+       emit_blank_records: false, # unless true, entirely blank records are not returned
+       trim_headers: true, # strip leading and trailing spaces from header names
+     }.freeze
+
+     # Create a new RecordSource intended to read from the specified input
+     # and using the parsing strategy specified by the loading guidelines.
+     def initialize(source,**loading_guidelines)
+       (loading_guidelines.keys - DEFAULT_LOADING_GUIDELINES.keys).tap{|x| raise <<~ERRMSG unless x.empty?}
+         Unrecognized named parameters used for RecordSource creation #{x.inspect}
+       ERRMSG
+       @loading_guidelines = DEFAULT_LOADING_GUIDELINES.merge(loading_guidelines).freeze
+       raise "The source must be non-nil" if source.nil?
+       @source = source # will lazy load
+       @is_mid_read = false # only relevant for non-parallel sources
+       @replayable_flag = if @source.is_a?(String)
+                            :parallel # simultaneous each() is okay
+                          elsif @source.respond_to?(:rewind)
+                            :single # can't guarantee simultaneous each() safe
+                          else
+                            nil
+                          end
+     end #new
+
+
+     # Yields the "records" of the first "table" encountered in the bound data
+     # source according to the parameters it was given. The first row yielded is
+     # always the header. Raises an error if a header is not found.
+     # Beware... depending on the type of data source used at creation, it
+     # may not be possible to rewind or retrieve data in parallel.
+     # With that said, a filename or String both allow parallel retrieval.
+     #
+     # Also note that blank strings will be passed through until the specified
+     # allow_gap_rows are exceeded. This can mean trailing blanks in long files.
+     #
+     # @yieldparam [Array] Array includes at least two elements. The first is
+     #        an Array of "fields". The second element is the record
+     #        number within the source (zero indexed). It's important
+     #        to note that if any field contains embedded newlines, the record
+     #        number is not the same as the line number.
+     # @return [Integer,Enumerator] Returns enumerator if no block is given.
+     #        Otherwise returns the count of records yielded, excluding
+     #        the header line.
+     def each(&block)
+       return enum_for(:each) unless block_given?
+       raise "This data source type may only be read once." if @source.nil?
+       raise <<~ERRMSG if @is_mid_read && @replayable_flag != :parallel
+         For this type of data source, you may not read simultaneously.
+       ERRMSG
+       @is_mid_read = true
+       scanner = self.class._make_record_stream(@source,format: @loading_guidelines[:format])
+       scanner = self.class._reposition_to_header_rec(scanner,@loading_guidelines)
+       raw_header,header_pos = scanner.next
+       header_range = self.class._calc_header_range(raw_header,@loading_guidelines[:allow_gap_columns])
+       # process the header line to create a "mask" and yield the trimmed header
+       header = raw_header[header_range]
+       header.map!{|s| s&.strip} if @loading_guidelines[:trim_headers]
+       yield [header,header_pos]
+       rec_count = self.class._emit_rows(scanner,header_range,
+                     @loading_guidelines[:emit_blank_records],
+                     trim_headers: false, # the header was already trimmed above
+                     tolerate_blanks: @loading_guidelines[:allow_gap_rows],
+                     &block
+                   )
+       @is_mid_read = false
+       @source = nil if @replayable_flag.nil? # release resources
+       return rec_count
+     end #each
+
+
+     ###########################################################################
+     private
+
+     SEPARATOR_CHAR_LIST = {tsv: "\t", csv: ','}.freeze
+
+     # Note, due to the need to terminate if too many blanks are seen, this may
+     # not read to the end of the file.
+     def self._emit_rows(raw_rec_enum,range_mask,emit_blank_records,trim_headers:,
+                         tolerate_blanks: nil,
+                         &block
+                        )
+       contig_blank_count = 0
+       emitted_record_count = 0
+       need_to_trim_row = trim_headers # trim the first emitted row, if requested
+       loop do
+         begin
+           rec,pos = raw_rec_enum.next
+           masked_rec = rec&.[](range_mask)
+           next if masked_rec.nil?
+           is_blank_record = masked_rec.all?{|s| s.nil? || /^\s*$/.match?(s)}
+           contig_blank_count = is_blank_record ? (contig_blank_count+1) : 0
+           if tolerate_blanks && contig_blank_count > tolerate_blanks
+             return emitted_record_count # end of records
+           else
+             if emit_blank_records || !is_blank_record
+               if need_to_trim_row # only done once, if at all
+                 masked_rec.map!{|s| s&.strip}
+                 need_to_trim_row = false # only first emitted row
+               end
+               yield [masked_rec,pos]
+               emitted_record_count += 1
+             end
+           end
+         rescue StopIteration
+           return emitted_record_count # running out of records is an okay end
+         end # rescuing
+       end #loop over records
+       raise "Unexpected issue encountered." # should never get here
+     end # self._emit_rows()
+
+     MIN_NONBLANK_HEADER_BEFORE_GAP_ALLOWED = 3
+
+     # Given a presumed header, identify the position of the largest contiguous
+     # block of non-blank fields and return the range information to be used
+     # for scraping successive rows.
+     def self._calc_header_range(raw_header_record,max_blank_cols)
+       max_blank_cols ||= 0 # default is no blank columns tolerated in header
+       blank_tol = max_blank_cols
+       ix0 = nil
+       runs = Array.new
+       (0..(raw_header_record.length-1)).each{|ix|
+         if raw_header_record[ix].nil? || /^\s*$/.match?(raw_header_record[ix])
+           if ix0.nil?
+             # deliberate no-op, in middle of blank run
+           elsif blank_tol >= 0 && ix-ix0 >= MIN_NONBLANK_HEADER_BEFORE_GAP_ALLOWED
+             # at least MIN_NONBLANK_HEADER_BEFORE_GAP_ALLOWED content-filled
+             # columns must be found before tolerating blanks
+             blank_tol -= 1
+           else
+             runs << [ix0,ix-1]
+             ix0 = nil
+             blank_tol = max_blank_cols
+           end
+         else # non blank
+           ix0 ||= ix # record start of run
+           blank_tol = max_blank_cols # reset tolerance for blanks
+         end
+       }
+       runs << [ix0,raw_header_record.length-1-(max_blank_cols-blank_tol)] if ix0
+       widest = runs.max{|a,b| a[1]-a[0] <=> b[1]-b[0] }
+       return widest && (widest[0]..widest[1]) # range of columns spanned by the widest run
+     end
+
+
+     # Opens up a record stream based on the source provided and the format rule
+     # specified.
+     # @param source [Object] Many different values are possible:
+     #        1) a filepath to a readable text file
+     #        2) a string variable (must contain at least one newline)
+     #        3) an enumerable of strings where each string is a record "line" to be parsed individually
+     #        4) an enumerable of arrays of strings (where the inner array is the column values).
+     #           In this case the format parameter is ignored.
+     # @param format [:csv,:tsv,:guess,Regexp] Indicates how to interpret column delimiters
+     #        and row delimiters. The format parameter is ignored if the source
+     #        is an enumerable of arrays.
+     # @return [Enumerator<Array>] Whose next() method returns two-element arrays
+     #        where the first element is the fields of the record
+     #        (in the form of an array of strings) and the second element
+     #        is the zero-based index indicating the record number within the source.
+     #        Note that the Enumerator returned may not be rewindable/replayable.
+     def self._make_record_stream(source,format: :csv||:tsv||:guess||nil)
+       col_sep = case format
+                 when nil
+                   nil
+                 when :csv,:tsv
+                   SEPARATOR_CHAR_LIST[format]
+                 when :guess
+                   # in the future, we might be able to guess based on reading
+                   # the first line and looking for tabs or commas
+                   if source.is_a?(String) && /\.(csv|tsv)$/i.match(source)
+                     SEPARATOR_CHAR_LIST[$1.downcase.to_sym]
+                   else
+                     raise "Format specified as :guess but unable to deduce format"
+                   end
+                 else
+                   raise "Currently unhandled format specifier [#{format}]"
+                 end
+       case source
+       in Enumerable if source.first.is_a?(String)
+         # This is the most manual case because of the need to try and detect
+         # lines that are split by a quoted multiline string.
+         return _chunk_lines(source,col_sep)
+       in Enumerable if source.first.is_a?(Array) and source.first.first.is_a?(String)
+         # no need for further processing, assume it already is a record source
+         return source.each_with_index
+       in String if source.include?("\n")
+         # if passed a string, it must be multiline to be treated as the data source
+         return CSV.parse(source,col_sep: col_sep).each_with_index
+       in String if !File.exist?(source)
+         raise "Unable to find the indicated file: #{source}"
+       in String if File.exist?(source) # presumed to be a valid filepath
+         return CSV.foreach(source,col_sep: col_sep).each_with_index
+       end #case source
+       raise "Unable to convert the provided source into a record stream"
+     end # self._make_record_stream()
+
+
+
+     # Tests a string to see if the last field looks like it might have a
+     # multiline field. This is detected by checking whether the rightmost
+     # field has unbalanced quote characters.
+     # IMPORTANT NOTE: It relies on being told whether the line begins within a
+     # quote
+     # (meaning that a complete line contains at least one unquoted separator).
+     def self._is_dangling?(line,sep_char,started_in_quote,quot_char: '"')
+       qc = Array.new # quote counts
+       qc << (started_in_quote ? 1 : 0)
+       # count quotes in each field to identify unbalanced quotes
+       line.each_char{|c|
+         if c == quot_char
+           qc[-1] += 1
+         elsif c == sep_char && qc.last.even?
+           qc << 0
+         end
+       }
+       return qc.last.odd?
+     end
+
+
+     # bunches up groups of lines when they look like the last column may
+     # contain a multiline value (with embedded carriage return)
+     def self._chunk_lines(line_enum,sep_char,&is_continued)
+       return enum_for(:_chunk_lines,line_enum,sep_char,&is_continued) unless block_given?
+       buf = ""
+       ix = 0 # will be one-based counter
+       is_in_quote = false
+       line_enum.each{|line|
+         if _is_dangling?(line,sep_char,is_in_quote)
+           is_in_quote = true
+           buf.concat(line,line.end_with?("\n") ? '' : "\n")
+         else
+           is_in_quote = false
+           rec = CSV.parse_line((buf.empty? ? line : buf.concat(line)),col_sep: sep_char)
+           buf.clear
+           yield [rec,(ix+=1)-1]
+         end
+       }
+       yield [CSV.parse_line(buf,col_sep: sep_char),(ix+=1)-1] unless buf.empty?
+       return nil # meaningless return value
+     end
+
+
+     # Assuming we are starting from the absolute top of the source, scan
+     # forward looking for the header row according to the provided guidelines.
+     # Note, it may have to read past the header row to ensure it's made the best
+     # possible choice for that header row.
+     #
+     # @param raw_rec_enum [Enumerator] record enumerator for "raw" records such
+     #        as is generated by _make_record_stream
+     # @param guidelines [Hash] guidelines package as generated during
+     #        instantiation of the class. Note this method is not intended to be
+     #        called publicly.
+     # @return [Enumerator] Enumerator that should replace the enumerator passed
+     #        in and whose first record is the header row.
+     # Important Note: The position of the raw_rec_enum is almost certain to be
+     #        changed by calling next() on it. It should not be used after this
+     #        call because of this and other buffer considerations.
+     def self._reposition_to_header_rec(raw_rec_enum,guidelines)
+       buffer = Array.new
+       begin
+         loop do
+           buffer << raw_rec_enum.next
+           break if buffer.length > guidelines[:allow_leading_skip_rows]
+         end
+       rescue StopIteration # deliberately a no-op
+         return nil if buffer.empty?
+       end
+       scores = Hash.new{|h,ix| h[ix] = 0} # scoring for possible header row
+       mand_cols = guidelines[:mandatory_headers]
+       buffer.each_with_index{|(rec,_),buf_pos|
+         hdr_range = _calc_header_range(rec,guidelines[:allow_gap_columns]) # best possible header range
+         next if hdr_range.nil?
+         if mand_cols.empty? || (mand_cols - rec[hdr_range]).empty?
+           scores[buf_pos] = 10*(hdr_range.size) + # prefer wide columns
+                             (mand_cols.empty? ? 0 : 99000) + # huge bonus for having mandatory columns
+                             (1 - buf_pos.to_f/buffer.length) # slight preference for early records
+         end
+         # possible other factors for future consideration:
+         #   preceding blank line
+         #   containing a non-blank row immediately beneath it
+       } # end examination of rows in the buffer
+       best_guess = scores.max{|a,b| a[1] <=> b[1] }
+       if best_guess.nil?
+         raise "Unable to find header record within the first #{[buffer.length,guidelines[:allow_leading_skip_rows]].min} records examined!"
+       end
+       #buffer[best_guess[0]..-1].to_enum + raw_rec_enum # chain enumerators to include buffer
+       # exclusive range: avoid yielding a nil element past the end of the buffer
+       return self._joined_enum(buffer,best_guess[0]...buffer.length, raw_rec_enum)
+     end
+
+     # This hack-method was added because Enumerator::Chain does not seem to
+     # support the next() method, so the chain is hand-rolled here.
+     # @return [nil,Enumerator]
+     def self._joined_enum(buffer1,buffer1_range,record_enum)
+       return enum_for(:_joined_enum,buffer1,buffer1_range,record_enum) unless block_given?
+       buffer1_range.each{|ix| yield buffer1[ix] }
+       buffer1 = nil # release (in case it matters)
+       begin
+         loop do
+           yield record_enum.next
+         end
+       rescue StopIteration # deliberately a no-op
+       end
+       return nil # meaningless return value
+     end
+
+   end #class TabularRecordSource
+
+
+ end #module Nodepile
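
A sketch of TabularRecordSource reading an in-memory multiline CSV string (one of the supported source types). The sample data is invented; the leading junk row is skipped because the wider "name,type,weight" row scores higher as a header candidate:

    require 'nodepile/rec_source.rb'   # assumes the gem is on the load path

    data = <<~CSV
      some leading note,,
      name,type,weight
      alpha,node,1
      beta,node,2
    CSV
    src = Nodepile::TabularRecordSource.new(data, format: :csv)
    src.each{|fields,rec_num|
      p [rec_num, fields]   # first yielded row is the header
    }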