nodepile 0.1.1 → 0.1.2

@@ -0,0 +1,258 @@
+ require 'nodepile/colspecs.rb'
+ require 'nodepile/rec_source.rb'
+ require 'nodepile/pragmas.rb'
+ require 'nodepile/rule_eval.rb'
+
+ module Nodepile
+
+   # Container class for managing a Nodepile. A nodepile consists of a set of
+   # entities including nodes, edges, and rules. It includes methods for
+   # enumerating the various items in the collection, filtering, and
+   # deducing the existence of implied edges using rules.
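+   #
+   # A minimal usage sketch (illustrative only; the file name is hypothetical
+   # and the record layout must follow nodepile/colspecs.rb conventions):
+   #
+   # @example
+   #   pile = Nodepile::PileOrganizer.new
+   #   pile.load_from_file('my_graph.csv')
+   #   pile.node_records.each{|rec| p rec }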
+   class PileOrganizer
+
+     # see nodepile/base_structs.rb for the definition of a Nodepile::EntityPacket
+     class ERecStack; end # ERecStack defined further down
+     class RuleCache; end # defined further down
+     SourceData = Struct.new(:source_name,:highest_sequence_num)
+
+
+     def initialize()
+       @nodes = Hash.new{|h,k| h[k] = ERecStack.new}
+       @edges = Hash.new{|h,k| h[k] = ERecStack.new}
+       @rules = Array.new # rules are not overlaid with one another
+       @pragmas = Nodepile::Pragmas.new
+       @sources = Hash.new{|h,k| h[k] = SourceData.new(k,0)} # lazily inserts a dummy entry for unspecified source names
+       @last_source_name = nil
+       @dirty = true
+     end
+
+     # If a source name is not specified, then the source is assumed to be
+     # the last source that was used to append. If no sequence number is
+     # provided, the sequence number is assumed to be one more than the highest
+     # sequence number that was specified. If callers are manually specifying
+     # sequence numbers for a source, they should do so consistently to avoid
+     # repeats.
+     # @param kaa [KeyedArrayAccessor] includes metadata about @type, @key, @is_implied
+     # @return [self]
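+     # @example Appending entities directly (illustrative; node_kaa/edge_kaa are
+     #   assumed to be KeyedArrayAccessor records built per base_structs.rb)
+     #   pile = Nodepile::PileOrganizer.new
+     #   pile.append(node_kaa)  # '@type' => :node
+     #   pile << edge_kaa       # '@type' => :edge (#<< is an alias for #append)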
+     def append(kaa)
+       @last_source_name = kaa.source || @last_source_name
+       source_data = @sources[@last_source_name]
+       # note that the way things work below deliberately "overlays" items
+       # when a matching key is encountered. Rule recalculation is deferred.
+       case kaa['@type']
+       when :node
+         @nodes[kaa['@key']] << kaa
+       when :edge
+         @edges[kaa['@key']] << kaa
+       when :rule
+         @rules << RuleCache.new(kaa)
+       when :pragma
+         @pragmas.parse(kaa['_id'])
+       else
+         raise "Unhandled entity type #{kaa['@type'].inspect}"
+       end #case
+       @dirty = true # newly appended data may change calculated rule impacts
+       return self
+     end
+
+     def node_count() = @nodes.length
+     def rule_count() = @rules.length
+     def edge_count() = @edges.length
+     def pragmas() = @pragmas
+
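+     # Look up the record stack for a single entity after applying any pending
+     # rule recalculation. A String key denotes a node; an Array key denotes
+     # an edge.
+     # @param key [String,Array]
+     # @return [ERecStack]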
+     def entity_record(key)
+       _update_rule_impacts
+       case key
+       when String
+         return @nodes[key]
+       when Array
+         return @edges[key]
+       else
+         raise "Unrecognized key structure/type"
+       end
+     end
+
+     # Provide summarized edge records in insertion order
+     def edge_records
+       return enum_for(:edge_records) unless block_given?
+       _update_rule_impacts
+       @edges.each_value{|erstack| yield(erstack.summary) }
+     end
+
+     # Provide summarized node records in insertion order
+     def node_records
+       return enum_for(:node_records) unless block_given?
+       _update_rule_impacts
+       @nodes.each_value{|erstack| yield(erstack.summary) }
+     end
+
+
+     # Alias for #append()
+     # @param entity_record [Nodepile::KeyedArrayAccessor]
+     def <<(entity_record) = append(entity_record)
+
+     # Loads the given file (on top of anything already stored in this object)
+     # @return [Integer] count of entities loaded
+     def load_from_file(tabular_filepath)
+       source = Nodepile::TabularRecordSource.new(tabular_filepath,format: :guess)
+       specs = nil
+       loaded_entity_count = 0
+       rec_src_meta = {'path' => tabular_filepath,'rec_num' => nil}
+       metadata = Hash.new
+       source.each{|(rec,rec_num)|
+         rec_src_meta['rec_num'] = rec_num
+         if specs.nil? # first row is the header
+           specs = Nodepile::InputColumnSpecs.new(rec,metadata_key_prefix: '@')
+         else
+           begin
+             specs.parse(rec,source: tabular_filepath,
+                         ref_num: rec_num,
+                         metadata: metadata,
+                        ){|keyed_array_accessor|
+               append(keyed_array_accessor)
+               loaded_entity_count += 1
+             }
+           rescue Nodepile::InputColumnSpecs::InvalidRecordError => err
+             # re-raise but add info about the record number that triggered the error
+             err.rec_num = rec_num
+             err.file_path = tabular_filepath
+             raise # re-raise
+           end
+         end #if
+       }
+       return loaded_entity_count
+     end
+
+
+     private
+     # This sledgehammer approach deletes all calculated impacts that may
+     # have previously been applied to the @edges and @nodes and recalculates
+     # all of them.
+     #
+     # Method is a no-op if the structures are up-to-date
+     def _update_rule_impacts(force = false)
+       return nil unless force || @dirty
+       @nodes.each_value(&:purge_rule_overlays)
+       @edges.each_value(&:purge_rule_overlays)
+       @rules.each{|rulecache|
+         recs = (rulecache.relevant_entity_type == :node ? @nodes : @edges).each_value
+         recs.each{|erstack|
+           if rulecache.match?(erstack.summary)
+             # calculate the rule as applied to the given node/edge
+             calculated_rule_erec = rulecache.eval_using(erstack.summary)
+             erstack << calculated_rule_erec
+           end
+         }
+       } #loop over rules
+       @dirty = false
+     end
+
+     # An ERecStack is a data structure used for holding and summarizing
+     # overlay-able records related to a given node or edge, which can include
+     # "rules" that apply to that node/edge.
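+     # A rough sketch of the stacking behavior (record construction is assumed,
+     # not shown; field names follow the '@'-prefixed metadata conventions):
+     #
+     # @example
+     #   stack = ERecStack.new
+     #   stack << base_rec      # first record seeds the summary
+     #   stack << overlay_rec   # later records overlay/underlay the summary
+     #   stack.summary          # merged view of the whole stack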
+     class ERecStack
+       def initialize()
+         @a = Array.new
+         @summary = nil
+         @mc = CrudeCalculationCache.new
+       end
+
+       def inspect
+         "#<#{self.class}:0x#{object_id.to_s(16)} type=#{type} key=#{self.key.inspect} depth=#{@a.length}>"
+       end
+
+
+       def type = @a.first['@type']
+       def key() = @a.first['@key']
+
+       def is_node? = self.type == :node
+       def is_edge? = self.type == :edge
+       def summary() = @summary
+       def to_a = @a
+
+       # A stack of type :node or :edge is implied if it contains
+       # no ERec records where the is_implied attribute is false.
+       # The return value of this method is undefined for other types.
+       def is_implied
+         @a.each{|kaa| return false if !kaa['@is_implied'] &&
+                                       [:node,:edge].include?(kaa['@type']) }
+         return true
+       end
+
+       # Delete overlayed rule records and rebuild the summary from the
+       # records that remain.
+       # @return [void]
+       def purge_rule_overlays()
+         @a.delete_if{|rec| rec['@type'] == :rule}
+         @summary = nil # force a clean recalculation from the surviving records
+         @a.each{|rec| @summary = self.class._update_summary(@summary,rec)} #recalc
+       end
+
+
+       # Note that this method does not verify whether it is appropriate to stack
+       # the new record and assumes callers have already done this due-diligence.
+       # @param rec [KeyedArrayAccessor]
+       def <<(rec)
+         raise "ERecStack may only hold objects of type Nodepile::KeyedArrayAccessor" unless rec.is_a?(Nodepile::KeyedArrayAccessor)
+         # Keep the summary up-to-date if we've got one.
+         @a << rec
+         if @a.length == 1
+           @summary = rec.dup
+         else
+           @summary = self.class._update_summary(@summary,rec)
+         end
+         return self
+       end
+
+       def each_keyed_array()
+         return enum_for(:each_keyed_array) unless block_given?
+         @a.each{|erec| yield erec }
+       end
+
+       private
+
+       def self._update_summary(cur_summary,new_overlay)
+         return new_overlay if cur_summary.nil?
+         cur_summary.underlay!(new_overlay)
+         cur_summary.source = nil if cur_summary.source != new_overlay.source
+         cur_summary.ref_num = nil # summary no longer represents a single ref_num
+         cur_summary.update_metadata('@is_implied',false) unless new_overlay['@is_implied']
+         return cur_summary
+       end
+
+     end #class Nodepile::PileOrganizer::ERecStack
+
+     # Represents cached information about a single specific rule
+     class RuleCache
+
+       # @param rule_erec [KeyedArrayAccessor] created from this rule record
+       def initialize(rule_erec)
+         @er = rule_erec.freeze # frozen to avoid casual alteration
+         raise "Only ERecs of type :rule may be stored in this structure" unless self.type == :rule
+         @verifiers = [*self.key].map{|s| Nodepile::InputColumnSpecs.make_pattern_match_verifier(s)}
+         @rule_eval = RuleRecordEvaluator.new(@er)
+       end
+
+       def inspect
+         "#<#{self.class}:0x#{object_id.to_s(16)} key=#{self.key.inspect}>"
+       end
+
+       def type = @er['@type']
+       def is_implied = @er['@is_implied']
+       def key = @er['@key']
+
+       # Note that a rule that uses dynamic matching cannot precalculate which
+       # records it matches and must (to be safe) recalculate the match in
+       # response to any changes
+       def uses_dynamic_match? = @rule_eval.uses_dynamic_match?
+
+       # @param kaa [KeyedArrayAccessor] a set of field values that will be
+       #   tested against this rule for matching.
+       def match?(kaa) = @rule_eval.match_record?(kaa)
+       def eval_using(kaa) = @rule_eval.calculate_rule(kaa)
+
+
+       def relevant_entity_type = @er['@key'].is_a?(String) ? :node : :edge
+
+     end #class Nodepile::PileOrganizer::RuleCache
+
+   end #class PileOrganizer
+
+
+ end #module Nodepile
@@ -0,0 +1,97 @@
+
+
+ module Nodepile
+
+   # Pragmas is a parser and organizer class used to interpret the meaning
+   # of pragmas that may appear in source files. By default, pragmas are
+   # coded instructions for rendering, parsing, or layout that may be embedded
+   # in input files. They are often instructions stored in the "_id" field
+   # of an input file that begin with a specific indicating string ('#pragma ').
+   # Pragmas may be used to control things like the specific layout engine that
+   # is used to visualize a graph (e.g. dot versus neato).
+   # Example pragma lines:
+   #
+   #    #pragma neato
+   #    #pragma unflatten
+   #
+   # Create an instance of a Nodepile::Pragmas object in order to track and
+   # interpret the collective pragmas of a given graph. Note that the most
+   # common rule in pragma interpretation is that if two pragmas contradict
+   # each other, the one furthest down in the file/parsing stream dominates.
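+   #
+   # An illustrative sketch using the simple indicators declared below:
+   #
+   # @example
+   #   prags = Nodepile::Pragmas.new
+   #   prags.parse('#pragma neato digraph')
+   #   prags[:layout_engine]   # => "neato"
+   #   prags[:directionality]  # => "digraph"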
+   class Pragmas
+     DEFAULT_PRAGMA_MARKER = "#pragma "
+
+     @@indicator_patterns = Array.new # array of [pragma_sym,regexp]
+
+     def initialize(pragma_marker: DEFAULT_PRAGMA_MARKER)
+       @marker = pragma_marker.freeze
+       @indicators = Hash.new # name mapped to value
+       # if you make this method more complex, remember to update #dup
+     end
+
+     def dup
+       c = self.class.new(pragma_marker: @marker)
+       c._indicators.merge!(@indicators)
+       return c
+     end
+
+     def [](pragma_sym)
+       @indicators[pragma_sym]
+     end
+
+     # @yield [pragma_sym,pragma_val] provides pairs of names and values as they
+     #   have been set. Only set values will appear in the yielded set.
+     def each_setting_pair
+       return enum_for(__method__) unless block_given?
+       @indicators.each_pair{|k,v| yield(k,v) }
+       return @indicators.length
+     end
+
+     # Parse the given pragma and store the meaning for access via the square
+     # bracket method or the #each_setting_pair method.
+     # @return [void]
+     def parse(pragma_string)
+       raise "Expecting pragma_string to start with [#{@marker}]" unless pragma_string.start_with?(@marker)
+
+       #TODO: If there are more complicated parsing rules, they should go before
+       # the simple fallthrough case of whitespace separated indicators
+
+       # Simple indicators are single "word" values where each value is delimited
+       # by whitespace. If two indicators apply to the same pragma_sym, the one
+       # parsed last wins.
+       pragma_string[@marker.length..-1].split(/\s+/).each{|s|
+         prag_sym,rx = @@indicator_patterns.find{|(_,pat)| pat.match?(s) }
+         if prag_sym
+           @indicators[prag_sym] = rx.match(s)[1]
+         else
+           raise "Unrecognized pragma encountered [#{s}]"
+         end
+       }
+       return nil
+     end
+
+     private
+
+     # Declares a "simple indicator": a pragma whose presence or absence is
+     # itself the setting. Such indicators can be stacked (multiple per pragma
+     # expression). For example:
+     #    #pragma neato unflatten
+     #
+     # The above line would mean that two separate effects were being invoked:
+     # the use of the "neato" rendering engine and the use of the unflatten
+     # setting to improve the aspect ratio of some directed graphs.
+     def self._decl_simple_indicator(prag_name,alt_regexp)
+       @@indicator_patterns << [prag_name,alt_regexp].freeze
+     end
+
+     _decl_simple_indicator(:layout_engine,/^(dot|neato|fdp|sfdp|circo|twopi|nop2|osage|patchwork)$/)
+     _decl_simple_indicator(:directionality,/^(graph|digraph)$/)
+     #_decl_simple_indicator(:unflatten,/^(unflatten)$/) # not sure this is supported
+
+     protected
+
+     # Crude accessor (protected) for duplication and merge
+     def _indicators = @indicators
+
+
+   end #class Nodepile::Pragmas
+
+
+ end #module Nodepile
@@ -0,0 +1,329 @@
+ require 'csv'
+ require 'stringio'
+
+ module Nodepile
+
+   # Generates "factories" for harvesting tabular data from a source stream/file.
+   # Includes facilities for parsing common file formats (CSV/TSV).
+   # Includes facilities for handling common problems encountered when parsing
+   # manually-created tabular data files, such as: relevant tabular data is not
+   # aligned "top-left", tabular data includes blank or repeated columns,
+   # tabular data ends before the end of the file, or
+   # summary rows appear in the tabular data that need to be ignored.
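+   #
+   # A minimal usage sketch (file name and headers are hypothetical):
+   #
+   # @example
+   #   src = Nodepile::TabularRecordSource.new('nodes.csv', format: :csv)
+   #   src.each{|rec,rec_num| p [rec_num,rec] } # first yield is the header row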
+   class TabularRecordSource
+     include Enumerable
+
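+     # Note: defaults written as `a||b||c` below are a self-documenting idiom;
+     # each expression evaluates to its first truthy operand (the effective
+     # default) while spelling out the other accepted values.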
+     DEFAULT_LOADING_GUIDELINES = {
+       mandatory_headers: [], # this can be extremely important for correctly finding tables
+       format: :csv||:tsv||:guess, # assume CSV unless told otherwise
+       allow_leading_skip_rows: 10, # arbitrary content that may appear before table
+       allow_gap_rows: 2||nil, # entirely blank rows appearing mid-table; nil allows infinitely many
+       allow_gap_columns: 1, # columns which have a blank header within the table
+       allow_left_offset: 5, # blank columns allowed left of table
+       duplicate_header_rule: :first||:last||:ignore||:rename||:fail, # keep the first
+       ignored_header_char: '#', # header names starting with this are just plain ignored
+       emit_blank_records: false, # unless true, entirely blank records are not returned
+       trim_headers: true, # strip leading and trailing spaces
+     }.freeze
+
+     # Create a new RecordSource intended to read from the specified input
+     # and using the parsing strategy specified by the loading guidelines.
+     def initialize(source,**loading_guidelines)
+       (loading_guidelines.keys - DEFAULT_LOADING_GUIDELINES.keys).tap{|x| raise <<~ERRMSG unless x.empty?}
+         Unrecognized named parameters used for RecordSource creation #{x.inspect}
+       ERRMSG
+       @loading_guidelines = DEFAULT_LOADING_GUIDELINES.merge(loading_guidelines).freeze
+       raise "The source must be non-nil" if source.nil?
+       @source = source # will lazy load
+       @is_mid_read = false # only relevant for non-parallel sources
+       @replayable_flag = if @source.is_a?(String)
+                            :parallel # simultaneous each() is okay
+                          elsif @source.respond_to?(:rewind)
+                            :single # can't guarantee simultaneous each() is safe
+                          else
+                            nil
+                          end
+     end #new
+
+
+     # Yields the "records" of the first "table" encountered in the bound data
+     # source according to the parameters it was given. First row yielded is
+     # always the header. Raises an error if a header is not found.
+     # Beware... depending on the type of data source used at creation, it
+     # may not be possible to rewind or retrieve data in parallel.
+     # With that said, a filename or String both allow parallel retrieval.
+     #
+     # Also note that blank records will be passed through until the specified
+     # allow_gap_rows are exceeded. This can mean trailing blanks in long files.
+     #
+     # @yieldparam [Array] Array includes at least two elements. The first is
+     #        an Array of "fields". The second element is the record
+     #        number within the source (zero indexed). It's important
+     #        to note that if any field contains embedded newlines, the record
+     #        number is not the same as the line number.
+     # @return [Integer,Enumerator] Returns an enumerator if no block is given.
+     #        Otherwise returns the count of records yielded excluding
+     #        the header line.
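+     # @example Reading from a multiline String source (illustrative)
+     #   src = Nodepile::TabularRecordSource.new("a,b\n1,2\n", format: :csv)
+     #   src.each{|rec,pos| p [pos,rec] }
+     #   # => prints [0, ["a", "b"]] then [1, ["1", "2"]]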
+     def each(&block)
+       return enum_for(:each) unless block_given?
+       raise "This data source type may only be read once." if @source.nil?
+       raise <<~ERRMSG if @is_mid_read && @replayable_flag != :parallel
+         For this type of data source, you may not read simultaneously.
+       ERRMSG
+       @is_mid_read = true
+       scanner = self.class._make_record_stream(@source,format: @loading_guidelines[:format])
+       scanner = self.class._reposition_to_header_rec(scanner,@loading_guidelines)
+       raw_header,header_pos = scanner.next
+       # process the header line to create a "mask" (column range) for the table
+       header_range = self.class._calc_header_range(raw_header,@loading_guidelines[:allow_gap_columns])
+       yield [raw_header[header_range],header_pos] # return the trimmed header
+       rec_count = self.class._emit_rows(scanner,header_range,
+                                         @loading_guidelines[:emit_blank_records],
+                                         trim_headers: @loading_guidelines[:trim_headers],
+                                         tolerate_blanks: @loading_guidelines[:allow_gap_rows],
+                                         &block
+                                        )
+       @is_mid_read = false
+       @source = nil if @replayable_flag.nil? # release resources
+       return rec_count
+     end #each
+
+
+     ###########################################################################
+     private
+
+     SEPARATOR_CHAR_LIST = {tsv: "\t", csv: ','}.freeze
+
+     # Note: due to the need to terminate when too many blanks are seen, this
+     # may not read to the end of the file.
+     def self._emit_rows(raw_rec_enum,range_mask,emit_blank_records,trim_headers:,
+                         tolerate_blanks: nil,
+                         &block
+                        )
+       contig_blank_count = 0
+       emitted_record_count = 0
+       need_to_trim_row = trim_headers # trim the first one
+       loop do
+         begin
+           rec,pos = raw_rec_enum.next
+           masked_rec = rec&.[](range_mask)
+           next if masked_rec.nil?
+           is_blank_record = masked_rec.all?{|s| s.nil? || /^\s*$/.match?(s)}
+           contig_blank_count = is_blank_record ? (contig_blank_count+1) : 0
+           if tolerate_blanks && contig_blank_count > tolerate_blanks
+             return emitted_record_count # end of records
+           else
+             if emit_blank_records || !is_blank_record
+               if need_to_trim_row # only done once (for the header) if at all
+                 masked_rec.map!{|s| s&.strip}
+                 need_to_trim_row = false # only first emitted row
+               end
+               yield [masked_rec,pos]
+               emitted_record_count += 1
+             end
+           end
+         rescue StopIteration
+           return emitted_record_count # running out of records is an okay end
+         end # rescuing
+       end #loop over records
+       raise "Unexpected issue encountered." # should never get here
+     end # self._emit_rows()
+
+     MIN_NONBLANK_HEADER_BEFORE_GAP_ALLOWED = 3
+
+     # Given a presumed header, identify the position of the largest contiguous
+     # block of non-blank fields and return the range information to be used
+     # for scraping successive rows.
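+     # Hand-traced illustrations (gap tolerance of 1 blank column):
+     #
+     # @example
+     #   _calc_header_range(['','a','b','c','','d'], 1) # => (1..5)
+     #   _calc_header_range(['','a','b','','c'], 1)     # => (1..2) gap came too early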
+     def self._calc_header_range(raw_header_record,max_blank_cols)
+       max_blank_cols ||= 0 # default is no blank columns tolerated in header
+       blank_tol = max_blank_cols
+       ix0 = nil
+       runs = Array.new
+       (0..(raw_header_record.length-1)).each{|ix|
+         if raw_header_record[ix].nil? || /^\s*$/.match?(raw_header_record[ix])
+           if ix0.nil?
+             # deliberate no-op, in middle of blank run
+           elsif blank_tol >= 0 && ix-ix0 >= MIN_NONBLANK_HEADER_BEFORE_GAP_ALLOWED
+             # at least three content-filled columns must be found before tolerating blanks
+             blank_tol -= 1
+           else
+             runs << [ix0,ix-1]
+             ix0 = nil
+             blank_tol = max_blank_cols
+           end
+         else # non blank
+           ix0 ||= ix # record start of run
+           blank_tol = max_blank_cols # reset tolerance for blanks
+         end
+       }
+       runs << [ix0,raw_header_record.length-1-(max_blank_cols-blank_tol)] if ix0
+       widest = runs.max{|a,b| a[1]-a[0] <=> b[1]-b[0] }
+       return widest && (widest[0]..widest[1]) # range spanning the widest run
+     end
+
+
+     # Opens up a record stream based on the source provided and the format rule
+     # specified.
+     # @param source [Object] Many different values are possible:
+     #    1) a filepath to a readable text file
+     #    2) a string variable (must contain at least one newline)
+     #    3) an enumerable of strings where each string is a record "line" to be parsed individually
+     #    4) an enumerable of arrays of strings (where the inner array is the column values).
+     #       In this case the format parameter is ignored.
+     # @param format [:csv,:tsv,:guess,Regexp] Indicates how to interpret column delimiters
+     #        and row delimiters. The format parameter is ignored if the source
+     #        is an enumerable of arrays (case 4 above).
+     # @return [Enumerator<Array>] Whose next() method returns two-element arrays
+     #        where the first element is the fields of the record
+     #        (in the form of an array of strings) and the second element
+     #        is the zero-based index indicating the record number within the source.
+     #        Note that the Enumerator returned may not be rewindable/replayable.
+     def self._make_record_stream(source,format: :csv||:tsv||:guess||nil)
+       col_sep = case format
+                 when nil
+                   nil
+                 when :csv,:tsv
+                   SEPARATOR_CHAR_LIST[format]
+                 when :guess
+                   # in the future, we might be able to guess based on reading
+                   # the first line and looking for tabs or commas
+                   if source.is_a?(String) && /\.(csv|tsv)$/i.match(source)
+                     SEPARATOR_CHAR_LIST[$1.downcase.to_sym]
+                   else
+                     raise "Format specified as :guess but unable to deduce format"
+                   end
+                 else
+                   raise "Currently unhandled format specifier [#{format}]"
+                 end
+       case source
+       in Enumerable if source.first.is_a?(String)
+         # This is the most manual case because of the need to try and detect
+         # lines that are split by a quoted multiline string.
+         return _chunk_lines(source,col_sep)
+       in Enumerable if source.first.is_a?(Array) and source.first.first.is_a?(String)
+         # no need for further processing, assume it already is a record source
+         return source.each_with_index
+       in String if source.include?("\n")
+         # if passed a string, it must be multiline to be treated as the data source
+         return CSV.parse(source,col_sep: col_sep).each_with_index
+       in String if !File.exist?(source)
+         raise "Unable to find the indicated file: #{source}"
+       in String if File.exist?(source) # presumed to be a valid filepath
+         return CSV.foreach(source,col_sep: col_sep).each_with_index
+       end #case source
+       raise "Unable to convert the provided source into a record stream"
+     end # self._make_record_stream()
+
+
+
+     # Tests a string to see if the last field looks like it might contain a
+     # multiline value. This is detected by checking whether the rightmost
+     # field has unbalanced quote characters.
+     # IMPORTANT NOTE: It relies on being told whether the line begins within a
+     # quote (meaning that a complete line contains at least one unquoted
+     # separator).
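+     # Hand-traced illustrations:
+     #
+     # @example
+     #   _is_dangling?('a,"b', ',', false)    # => true (unbalanced quote dangles)
+     #   _is_dangling?('a,"b",c', ',', false) # => false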
+     def self._is_dangling?(line,sep_char,started_in_quote,quot_char: '"')
+       qc = Array.new # quote counts, one entry per field
+       qc << (started_in_quote ? 1 : 0)
+       # count quotes in each field to identify unbalanced quotes
+       line.each_char{|c|
+         if c == quot_char
+           qc[-1] += 1
+         elsif c == sep_char && qc.last.even?
+           qc << 0
+         end
+       }
+       return qc.last.odd?
+     end
+
+
+     # Bunches up groups of lines when they look like the last column may
+     # contain a multiline value (with an embedded carriage return).
+     def self._chunk_lines(line_enum,sep_char,&is_continued)
+       return enum_for(:_chunk_lines,line_enum,sep_char,&is_continued) unless block_given?
+       buf = ""
+       ix = 0 # will be one-based counter
+       is_in_quote = false
+       line_enum.each{|line|
+         if _is_dangling?(line,sep_char,is_in_quote)
+           is_in_quote = true
+           buf.concat(line, line.end_with?("\n") ? '' : "\n")
+         else
+           is_in_quote = false
+           rec = CSV.parse(buf.empty? ? line : buf.concat(line),col_sep: sep_char).first
+           buf.clear
+           yield [rec,(ix+=1)-1]
+         end
+       }
+       yield [CSV.parse(buf,col_sep: sep_char).first,(ix+=1)-1] unless buf.empty?
+       return nil # meaningless return value
+     end
+
+
+     # Assuming we are starting from the absolute top of the source, scan
+     # forward looking for the header row according to the provided guidelines.
+     # Note, it may have to read past the header row to ensure it's made the best
+     # possible choice for that header row.
+     #
+     # @param raw_rec_enum [Enumerator] record enumerator for "raw" records such
+     #    as is generated by _make_record_stream
+     # @param guidelines [Hash] guidelines package as generated during
+     #    instantiation of the class. Note this method is not intended to be
+     #    called publicly.
+     # @return [Enumerator] Enumerator that should replace the enumerator passed
+     #    in and whose first record is the header row.
+     # Important Note: The position of the raw_rec_enum is almost certain to be
+     #    changed by calling next() on it. It should not be used after this
+     #    call because of this and other buffer considerations.
+     def self._reposition_to_header_rec(raw_rec_enum,guidelines)
+       buffer = Array.new
+       begin
+         loop do
+           buffer << raw_rec_enum.next
+           break if buffer.length > guidelines[:allow_leading_skip_rows]
+         end
+       rescue StopIteration # deliberately a no-op
+         return nil if buffer.empty?
+       end
+       scores = Hash.new{|h,ix| h[ix] = 0} # scoring for possible header row
+       mand_cols = guidelines[:mandatory_headers]
+       buffer.each_with_index{|(rec,_),buf_pos|
+         hdr_range = _calc_header_range(rec,guidelines[:allow_gap_columns]) # best possible header range
+         next if hdr_range.nil?
+         if mand_cols.empty? || (mand_cols - rec[hdr_range]).empty?
+           scores[buf_pos] = 10*(hdr_range.size) + # prefer wide header ranges
+                             (mand_cols.empty? ? 0 : 99000) + # huge bonus for having mandatory columns
+                             (1 - buf_pos.to_f/buffer.length) # slight preference for early records
+         end
+         # possible other factors for future consideration:
+         #   preceding blank line
+         #   containing a non-blank row immediately beneath it
+       } # end examination of rows in the buffer
+       best_guess = scores.max{|a,b| a[1] <=> b[1] }
+       if best_guess.nil?
+         raise "Unable to find header record within the first #{[buffer.length,guidelines[:allow_leading_skip_rows]].min} records examined!"
+       end
+       #buffer[best_guess[0]..-1].to_enum + raw_rec_enum # chain enumerators to include buffer
+       # note the exclusive range: valid buffer indices run 0...buffer.length
+       return self._joined_enum(buffer,best_guess[0]...buffer.length, raw_rec_enum)
+     end
+
+     # This hack-method was added because, for some reason, Enumerator::Chain
+     # does not seem to support the next() method, so the chaining is
+     # hand-rolled here.
+     # @return [nil,Enumerator]
+     def self._joined_enum(buffer1,buffer1_range,record_enum)
+       return enum_for(:_joined_enum,buffer1,buffer1_range,record_enum) unless block_given?
+       buffer1_range.each{|ix| yield buffer1[ix] }
+       buffer1 = nil # release (in case it matters)
+       begin
+         loop do
+           yield record_enum.next
+         end
+       rescue StopIteration # deliberately a no-op
+       end
+       return nil # meaningless return value
+     end
+
+   end #class TabularRecordSource
+
+
+ end #module Nodepile