activewarehouse-etl-sgonyea 0.9.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. data/.gitignore +9 -0
  2. data/0.9-UPGRADE +6 -0
  3. data/CHANGELOG +236 -0
  4. data/Gemfile +4 -0
  5. data/HOW_TO_RELEASE +13 -0
  6. data/LICENSE +7 -0
  7. data/README.textile +111 -0
  8. data/Rakefile +103 -0
  9. data/TODO +28 -0
  10. data/active_support_logger.patch +78 -0
  11. data/activewarehouse-etl.gemspec +36 -0
  12. data/bin/etl +28 -0
  13. data/bin/etl.cmd +8 -0
  14. data/examples/database.example.yml +16 -0
  15. data/lib/etl.rb +97 -0
  16. data/lib/etl/batch.rb +2 -0
  17. data/lib/etl/batch/batch.rb +111 -0
  18. data/lib/etl/batch/directives.rb +65 -0
  19. data/lib/etl/builder.rb +2 -0
  20. data/lib/etl/builder/date_dimension_builder.rb +96 -0
  21. data/lib/etl/builder/time_dimension_builder.rb +31 -0
  22. data/lib/etl/commands/etl.rb +89 -0
  23. data/lib/etl/control.rb +3 -0
  24. data/lib/etl/control/control.rb +405 -0
  25. data/lib/etl/control/destination.rb +438 -0
  26. data/lib/etl/control/destination/csv_destination.rb +113 -0
  27. data/lib/etl/control/destination/database_destination.rb +97 -0
  28. data/lib/etl/control/destination/excel_destination.rb +91 -0
  29. data/lib/etl/control/destination/file_destination.rb +126 -0
  30. data/lib/etl/control/destination/insert_update_database_destination.rb +136 -0
  31. data/lib/etl/control/destination/update_database_destination.rb +109 -0
  32. data/lib/etl/control/destination/yaml_destination.rb +74 -0
  33. data/lib/etl/control/source.rb +132 -0
  34. data/lib/etl/control/source/database_source.rb +224 -0
  35. data/lib/etl/control/source/enumerable_source.rb +11 -0
  36. data/lib/etl/control/source/file_source.rb +90 -0
  37. data/lib/etl/control/source/model_source.rb +39 -0
  38. data/lib/etl/core_ext.rb +1 -0
  39. data/lib/etl/core_ext/time.rb +5 -0
  40. data/lib/etl/core_ext/time/calculations.rb +42 -0
  41. data/lib/etl/engine.rb +582 -0
  42. data/lib/etl/execution.rb +19 -0
  43. data/lib/etl/execution/base.rb +8 -0
  44. data/lib/etl/execution/batch.rb +10 -0
  45. data/lib/etl/execution/job.rb +8 -0
  46. data/lib/etl/execution/migration.rb +90 -0
  47. data/lib/etl/generator.rb +2 -0
  48. data/lib/etl/generator/generator.rb +20 -0
  49. data/lib/etl/generator/surrogate_key_generator.rb +39 -0
  50. data/lib/etl/http_tools.rb +139 -0
  51. data/lib/etl/parser.rb +11 -0
  52. data/lib/etl/parser/apache_combined_log_parser.rb +49 -0
  53. data/lib/etl/parser/csv_parser.rb +93 -0
  54. data/lib/etl/parser/excel_parser.rb +112 -0
  55. data/lib/etl/parser/fixed_width_parser.rb +65 -0
  56. data/lib/etl/parser/nokogiri_xml_parser.rb +83 -0
  57. data/lib/etl/parser/parser.rb +41 -0
  58. data/lib/etl/parser/sax_parser.rb +218 -0
  59. data/lib/etl/parser/xml_parser.rb +65 -0
  60. data/lib/etl/processor.rb +11 -0
  61. data/lib/etl/processor/block_processor.rb +14 -0
  62. data/lib/etl/processor/bulk_import_processor.rb +94 -0
  63. data/lib/etl/processor/check_exist_processor.rb +80 -0
  64. data/lib/etl/processor/check_unique_processor.rb +39 -0
  65. data/lib/etl/processor/copy_field_processor.rb +26 -0
  66. data/lib/etl/processor/database_join_processor.rb +82 -0
  67. data/lib/etl/processor/encode_processor.rb +55 -0
  68. data/lib/etl/processor/ensure_fields_presence_processor.rb +24 -0
  69. data/lib/etl/processor/escape_csv_processor.rb +77 -0
  70. data/lib/etl/processor/filter_row_processor.rb +51 -0
  71. data/lib/etl/processor/ftp_downloader_processor.rb +68 -0
  72. data/lib/etl/processor/ftp_uploader_processor.rb +65 -0
  73. data/lib/etl/processor/hierarchy_exploder_processor.rb +55 -0
  74. data/lib/etl/processor/imapattachment_downloader_processor.rb +91 -0
  75. data/lib/etl/processor/pop3attachment_downloader_processor.rb +90 -0
  76. data/lib/etl/processor/print_row_processor.rb +12 -0
  77. data/lib/etl/processor/processor.rb +25 -0
  78. data/lib/etl/processor/rename_processor.rb +24 -0
  79. data/lib/etl/processor/require_non_blank_processor.rb +26 -0
  80. data/lib/etl/processor/row_processor.rb +27 -0
  81. data/lib/etl/processor/sequence_processor.rb +23 -0
  82. data/lib/etl/processor/sftp_downloader_processor.rb +63 -0
  83. data/lib/etl/processor/sftp_uploader_processor.rb +63 -0
  84. data/lib/etl/processor/surrogate_key_processor.rb +53 -0
  85. data/lib/etl/processor/truncate_processor.rb +40 -0
  86. data/lib/etl/processor/zip_file_processor.rb +27 -0
  87. data/lib/etl/row.rb +20 -0
  88. data/lib/etl/screen.rb +14 -0
  89. data/lib/etl/screen/row_count_screen.rb +20 -0
  90. data/lib/etl/transform.rb +2 -0
  91. data/lib/etl/transform/block_transform.rb +13 -0
  92. data/lib/etl/transform/calculation_transform.rb +71 -0
  93. data/lib/etl/transform/date_to_string_transform.rb +20 -0
  94. data/lib/etl/transform/decode_transform.rb +51 -0
  95. data/lib/etl/transform/default_transform.rb +20 -0
  96. data/lib/etl/transform/foreign_key_lookup_transform.rb +211 -0
  97. data/lib/etl/transform/hierarchy_lookup_transform.rb +49 -0
  98. data/lib/etl/transform/md5_transform.rb +13 -0
  99. data/lib/etl/transform/ordinalize_transform.rb +14 -0
  100. data/lib/etl/transform/sha1_transform.rb +13 -0
  101. data/lib/etl/transform/split_fields_transform.rb +27 -0
  102. data/lib/etl/transform/string_to_date_time_transform.rb +14 -0
  103. data/lib/etl/transform/string_to_date_transform.rb +16 -0
  104. data/lib/etl/transform/string_to_time_transform.rb +11 -0
  105. data/lib/etl/transform/transform.rb +61 -0
  106. data/lib/etl/transform/trim_transform.rb +26 -0
  107. data/lib/etl/transform/type_transform.rb +35 -0
  108. data/lib/etl/util.rb +59 -0
  109. data/lib/etl/version.rb +3 -0
  110. data/test-matrix.yml +10 -0
  111. data/test/.gitignore +1 -0
  112. data/test/.ignore +2 -0
  113. data/test/all.ebf +6 -0
  114. data/test/apache_combined_log.ctl +11 -0
  115. data/test/batch_test.rb +41 -0
  116. data/test/batch_with_error.ebf +6 -0
  117. data/test/batched1.ctl +0 -0
  118. data/test/batched2.ctl +0 -0
  119. data/test/block_processor.ctl +6 -0
  120. data/test/block_processor_error.ctl +1 -0
  121. data/test/block_processor_pre_post_process.ctl +4 -0
  122. data/test/block_processor_remove_rows.ctl +5 -0
  123. data/test/block_processor_test.rb +38 -0
  124. data/test/check_exist_processor_test.rb +92 -0
  125. data/test/check_unique_processor_test.rb +40 -0
  126. data/test/config/Gemfile.rails-2.3.x +3 -0
  127. data/test/config/Gemfile.rails-2.3.x.lock +53 -0
  128. data/test/config/Gemfile.rails-3.0.x +3 -0
  129. data/test/config/Gemfile.rails-3.0.x.lock +61 -0
  130. data/test/config/common.rb +29 -0
  131. data/test/connection/mysql/connection.rb +9 -0
  132. data/test/connection/mysql/schema.sql +37 -0
  133. data/test/connection/postgresql/connection.rb +13 -0
  134. data/test/connection/postgresql/schema.sql +40 -0
  135. data/test/control_test.rb +43 -0
  136. data/test/data/apache_combined_log.txt +3 -0
  137. data/test/data/bulk_import.txt +3 -0
  138. data/test/data/bulk_import_with_empties.txt +3 -0
  139. data/test/data/decode.txt +3 -0
  140. data/test/data/delimited.txt +3 -0
  141. data/test/data/encode_source_latin1.txt +2 -0
  142. data/test/data/excel.xls +0 -0
  143. data/test/data/excel2.xls +0 -0
  144. data/test/data/fixed_width.txt +3 -0
  145. data/test/data/multiple_delimited_1.txt +3 -0
  146. data/test/data/multiple_delimited_2.txt +3 -0
  147. data/test/data/nokogiri.xml +38 -0
  148. data/test/data/people.txt +3 -0
  149. data/test/data/sax.xml +14 -0
  150. data/test/data/xml.xml +16 -0
  151. data/test/database_join_processor_test.rb +43 -0
  152. data/test/date_dimension_builder_test.rb +96 -0
  153. data/test/delimited.ctl +30 -0
  154. data/test/delimited_absolute.ctl +31 -0
  155. data/test/delimited_destination_db.ctl +23 -0
  156. data/test/delimited_excel.ctl +31 -0
  157. data/test/delimited_insert_update.ctl +34 -0
  158. data/test/delimited_update.ctl +34 -0
  159. data/test/delimited_with_bulk_load.ctl +34 -0
  160. data/test/destination_test.rb +275 -0
  161. data/test/directive_test.rb +23 -0
  162. data/test/encode_processor_test.rb +32 -0
  163. data/test/engine_test.rb +78 -0
  164. data/test/ensure_fields_presence_processor_test.rb +28 -0
  165. data/test/errors.ctl +24 -0
  166. data/test/etl_test.rb +42 -0
  167. data/test/excel.ctl +24 -0
  168. data/test/excel2.ctl +25 -0
  169. data/test/fixed_width.ctl +35 -0
  170. data/test/foreign_key_lookup_transform_test.rb +50 -0
  171. data/test/generator_test.rb +14 -0
  172. data/test/inline_parser.ctl +17 -0
  173. data/test/mocks/mock_destination.rb +26 -0
  174. data/test/mocks/mock_source.rb +25 -0
  175. data/test/model_source.ctl +14 -0
  176. data/test/multiple_delimited.ctl +22 -0
  177. data/test/multiple_source_delimited.ctl +39 -0
  178. data/test/nokogiri_all.ctl +35 -0
  179. data/test/nokogiri_select.ctl +35 -0
  180. data/test/nokogiri_test.rb +35 -0
  181. data/test/parser_test.rb +224 -0
  182. data/test/performance/delimited.ctl +30 -0
  183. data/test/processor_test.rb +44 -0
  184. data/test/row_processor_test.rb +17 -0
  185. data/test/sax.ctl +26 -0
  186. data/test/scd/1.txt +1 -0
  187. data/test/scd/2.txt +1 -0
  188. data/test/scd/3.txt +1 -0
  189. data/test/scd_test.rb +257 -0
  190. data/test/scd_test_type_1.ctl +43 -0
  191. data/test/scd_test_type_2.ctl +34 -0
  192. data/test/screen_test.rb +9 -0
  193. data/test/screen_test_error.ctl +3 -0
  194. data/test/screen_test_fatal.ctl +3 -0
  195. data/test/source_test.rb +154 -0
  196. data/test/test_helper.rb +37 -0
  197. data/test/transform_test.rb +101 -0
  198. data/test/truncate_processor_test.rb +37 -0
  199. data/test/xml.ctl +31 -0
  200. metadata +370 -0
@@ -0,0 +1,112 @@
1
+ optional_require 'spreadsheet'
2
+
3
+ module ETL
4
+ module Parser
5
# Parser that reads rows from the worksheets of Excel workbooks opened
# through Spreadsheet.open.
class ExcelParser < ETL::Parser::Parser

  # When truthy, rows for which <tt>empty?</tt> is true are skipped
  # instead of being validated and yielded.
  attr_accessor :ignore_blank_line

  # Initialize the parser
  # * <tt>source</tt>: The Source object
  # * <tt>options</tt>: Parser options Hash
  def initialize(source, options={})
    super
    configure
  end

  # Yields each row as a Hash keyed by field name.
  #
  # Every file matched by the configured glob is opened. When no worksheets
  # are configured all worksheets of the book are read, otherwise only the
  # configured indexes. The skipped-line and line counters are kept per
  # file, not per worksheet.
  def each
    Dir.glob(file).each do |path|
      ETL::Engine.logger.debug "parsing #{path}"
      line = 0
      lines_skipped = 0
      book = Spreadsheet.open path

      sheets =
        if worksheets.empty?
          book.worksheets
        else
          worksheets.map { |index| book.worksheet(index) }
        end

      sheets.each do |sheet|
        sheet.each do |raw_row|
          if lines_skipped < source.skip_lines
            ETL::Engine.logger.debug "skipping line"
            lines_skipped += 1
            next
          end
          line += 1
          if ignore_blank_line && raw_row.empty?
            lines_skipped += 1
            next
          end
          validate_row(raw_row, line, path)
          row = {}
          raw_row.each_with_index do |value, index|
            row[fields[index].name] = value
          end
          yield row
        end
      end
    end
  end

  # Get an array of defined worksheets
  def worksheets
    @worksheets ||= []
  end

  # Get an array of defined fields
  def fields
    @fields ||= []
  end

  private

  # Raises MismatchError when the row does not have exactly as many cells
  # as there are defined fields.
  def validate_row(row, line, file)
    ETL::Engine.logger.debug "validating line #{line} in file #{file}"
    return if row.length == fields.length
    # raise_with_info expects (error, message, file, line), in that order.
    raise_with_info(
      MismatchError,
      "The number of columns from the source (#{row.length}) does not match the number of columns in the definition (#{fields.length})",
      file, line
    )
  end

  # Reads :worksheets, :ignore_blank_line and :fields from the source
  # definition.
  #
  # Raises DefinitionError when a worksheet entry is not convertible by
  # Integer() or a field entry is neither a Symbol nor a Hash.
  def configure
    Array(source.definition[:worksheets]).each do |worksheet|
      begin
        # Integer() raises for invalid input rather than returning a falsy
        # value, so the failure has to be rescued to report it properly.
        Integer(worksheet)
      rescue ArgumentError, TypeError
        raise DefinitionError, "Each worksheet definition must be an integer"
      end
      worksheets << worksheet.to_i
    end

    self.ignore_blank_line = source.definition[:ignore_blank_line]

    source.definition[:fields].each do |options|
      case options
      when Symbol
        fields << Field.new(options)
      when Hash
        fields << Field.new(options[:name])
      else
        raise DefinitionError, "Each field definition must either be a symbol or a hash"
      end
    end
  end

  class Field #:nodoc:
    attr_reader :name
    def initialize(name)
      @name = name
    end
  end

end
111
+ end
112
+ end
@@ -0,0 +1,65 @@
1
+ module ETL #:nodoc:
2
+ module Parser #:nodoc:
3
# Parser for fixed width files
class FixedWidthParser < ETL::Parser::Parser
  # Initialize the parser
  # * <tt>source</tt>: The source object
  # * <tt>options</tt>: Parser options Hash
  def initialize(source, options={})
    super
    configure
  end

  # Yields each row as a Hash keyed by the field keys of the definition.
  #
  # The first <tt>source.skip_lines</tt> lines of every matched file are
  # skipped. Each value is the stripped slice of the line described by the
  # field's start and length; a line too short to reach a field yields an
  # empty string for that field.
  def each
    Dir.glob(file).each do |path|
      # Counted per file, outside the line loop, so whole lines are skipped.
      lines_skipped = 0
      # File.foreach closes the file once iteration finishes.
      File.foreach(path) do |line|
        if lines_skipped < source.skip_lines
          lines_skipped += 1
          next
        end
        row = {}
        fields.each do |name, f|
          # TODO make strip optional?
          # String#[] returns nil when the start lies beyond the line end.
          row[name] = line[f.field_start, f.field_length].to_s.strip
        end
        yield row
      end
    end
  end

  # Return a map of defined fields
  def fields
    @fields ||= {}
  end

  private

  # Builds one FixedWidthField per entry of the source definition from its
  # :name, :start, :end and :length options.
  def configure
    source.definition.each do |field, options|
      fields[field] = FixedWidthField.new(
        options[:name], options[:start], options[:end], options[:length]
      )
    end
  end
end
46
+
47
# Describes where one field lives inside a fixed width line.
class FixedWidthField #:nodoc:
  attr_reader :name, :field_start, :field_end, :field_length

  # Initialize the field.
  # * <tt>name</tt>: The field name
  # * <tt>field_start</tt>: One-based start position; stored zero-based
  # * <tt>field_end</tt>: End position; takes precedence over field_length
  # * <tt>field_length</tt>: Length, used only when field_end is not given
  #
  # Raises DefinitionError when neither field_end nor field_length is given.
  def initialize(name, field_start, field_end=nil, field_length=nil)
    @name = name
    @field_start = field_start - 1

    unless field_end || field_length
      raise DefinitionError, "Either field_end or field_length required"
    end

    if field_end
      @field_end, @field_length = field_end, field_end - @field_start
    else
      @field_length, @field_end = field_length, @field_start + field_length
    end
  end
end
64
+ end
65
+ end
@@ -0,0 +1,83 @@
1
+ optional_require 'nokogiri'
2
+ require 'open-uri'
3
+ optional_require 'zlib'
4
+
5
+ module ETL
6
+ module Parser
7
# Parser that loads XML documents with Nokogiri and yields one row per node
# matched by the configured collection XPath. Gzip-compressed files are
# detected by their magic bytes and decompressed on the fly.
class NokogiriXmlParser < ETL::Parser::Parser
  # First two bytes of every gzip stream.
  GZIP_MAGIC = [0x1F, 0x8B].freeze

  # Initialize the parser
  # * <tt>source</tt>: The Source object
  # * <tt>options</tt>: Parser options Hash
  def initialize(source, options={})
    super
    configure
  end

  # Yields each row as a Hash keyed by field name. The value of a field is
  # the text of the nodes its XPath selects relative to the collection node.
  def each
    Dir.glob(file).each do |path|
      doc = load_document(path)

      doc.xpath(@collection_xpath).each do |node|
        row = {}
        fields.each do |f|
          row[f.name] = node.xpath(f.xpath).text
        end
        yield row
      end
    end
  end

  # Get an array of defined fields
  def fields
    @fields ||= []
  end

  private

  # Parses the file at +path+, going through Zlib::GzipReader when the file
  # starts with the gzip magic bytes. The block forms close the underlying
  # handle once Nokogiri::XML has returned the document.
  def load_document(path)
    doc = nil
    if gzipped?(path)
      Zlib::GzipReader.open(path) { |gz| doc = Nokogiri::XML(gz) }
    else
      File.open(path) { |io| doc = Nokogiri::XML(io) }
    end
    doc
  end

  # Returns true when +path+ exists and begins with the gzip magic bytes.
  # Bytes are compared numerically so the check does not depend on how
  # Array#to_s renders its elements.
  def gzipped?(path)
    return false unless File.exist?(path)
    header = File.open(path, 'rb') { |io| io.read(2) }
    # read(2) returns nil for an empty file.
    !header.nil? && header.unpack('C2') == GZIP_MAGIC
  end

  # Reads :collection and :fields from the source definition.
  #
  # Raises RuntimeError when :collection is missing and DefinitionError when
  # a field entry is neither a Symbol nor a Hash.
  def configure
    @collection_xpath = source.definition[:collection]
    if @collection_xpath.nil?
      raise ":collection => 'XPath' argument required"
    end
    source.definition[:fields].each do |options|
      case options
      when Symbol
        fields << Field.new(options, options.to_s)
      when Hash
        # The XPath defaults to the field name.
        options[:xpath] ||= options[:name]
        fields << Field.new(options[:name], options[:xpath].to_s)
      else
        raise DefinitionError,
          "Each field definition must either be an symbol " +
          "or a hash of options for the field"
      end
    end
  end

  # A field name together with the XPath used to extract it.
  class Field
    attr_reader :name, :xpath
    def initialize(name, xpath)
      @name = name
      @xpath = xpath
    end
  end
end
82
+ end
83
+ end
@@ -0,0 +1,41 @@
1
+ module ETL #:nodoc:
2
+ module Parser #:nodoc:
3
# Base class for all parsers. Subclasses implement +each+, which should
# yield every row of the source data as a Hash; Enumerable is mixed in on
# top of that.
class Parser
  include Enumerable

  # Convert the name (string or symbol) to a parser class.
  #
  # Example:
  #   <tt>class_for_name(:fixed_width)</tt> returns a FixedWidthParser class
  def self.class_for_name(name)
    parser_class_name = "#{name.to_s.camelize}Parser"
    ETL::Parser.const_get(parser_class_name)
  end

  # The Source object for the data
  attr_reader :source

  # Options Hash for the parser
  attr_reader :options

  # * <tt>source</tt>: The Source object
  # * <tt>options</tt>: Parser options Hash; an empty Hash is used when a
  #   falsy value is passed
  def initialize(source, options={})
    @source = source
    @options = options || {}
  end

  protected

  # Returns the configured :file as a Pathname. A relative path is resolved
  # against the directory of the control file.
  def file
    configured = Pathname.new(source.configuration[:file])
    return configured if configured.absolute?

    control_dir = Pathname.new(File.dirname(source.control.file))
    control_dir + configured
  end

  # Raises +error+ with +message+ followed by the line and file location.
  # Note the parameter order: file first, then line.
  def raise_with_info(error, message, file, line)
    location = "(line #{line} in #{file})"
    raise error, "#{message} #{location}"
  end
end
40
+ end
41
+ end
@@ -0,0 +1,218 @@
1
+ require 'rexml/parsers/sax2parser'
2
+ require 'rexml/sax2listener'
3
+
4
+ module ETL #:nodoc:
5
+ module Parser #:nodoc:
6
# ETL parser implementation which uses SAX to parse XML files.
class SaxParser < ETL::Parser::Parser

  # The write trigger causes whatever values are currently specified for the row to be returned.
  # After returning the values will not be cleared, thus allowing for values which are assigned
  # higher in the XML tree to remain in memory.
  attr_accessor :write_trigger

  # Initialize the parser
  # * <tt>source</tt>: The Source object
  # * <tt>options</tt>: Parser options Hash
  def initialize(source, options={})
    super
    configure
  end

  # Parses every file matched by the configured glob with a REXML SAX2
  # parser. The block is handed to a Listener, which calls it with a row
  # Hash whenever the write trigger path is closed.
  def each(&block)
    Dir.glob(file).each do |path|
      # Block form so the file handle is closed once parsing finishes or fails.
      File.open(path) do |io|
        parser = REXML::Parsers::SAX2Parser.new(io)
        parser.listen(Listener.new(self, &block))
        parser.parse
      end
    end
  end

  # Get an array of Field objects
  def fields
    @fields ||= []
  end

  private

  # Reads :write_trigger and :fields from the source definition. Each entry
  # of :fields is a name and a path string, which is parsed into an
  # XPath::Path.
  def configure
    self.write_trigger = source.definition[:write_trigger]
    # map paths to field names
    source.definition[:fields].each do |name, path|
      fields << Field.new(name, XPath::Path.parse(path))
    end
  end

  # Class representing a field to be loaded from the source
  class Field
    # The name of the field
    attr_reader :name
    # The XPath-like path to the field in the XML document
    attr_reader :path

    def initialize(name, path) #:nodoc:
      @name = name
      @path = path
    end
  end
end
61
+
62
# SAX2 listener that tracks the current element path, collects field values
# into a row Hash and calls the given block with a copy of the row each time
# the parser's write trigger path is closed.
class Listener #:nodoc:
  include REXML::SAX2Listener

  # * <tt>parser</tt>: object providing +fields+ and +write_trigger+
  # * <tt>block</tt>: called with a clone of the current row
  def initialize(parser, &block)
    @parser = parser
    @row = {}
    @value = nil
    @proc = Proc.new(&block)
  end

  # Appends CDATA text, unstripped, to the value of the current element.
  def cdata(text)
    # @value is nil until the first content of an element arrives.
    @value ||= ''
    @value << text
  end

  # Appends stripped character data to the value of the current element;
  # whitespace-only text is ignored.
  def characters(text)
    stripped = text.strip
    unless stripped.empty?
      @value ||= ''
      @value << stripped
    end
  end

  # Starts with an empty element path.
  def start_document
    @path = XPath::Path.new
  end

  def end_document

  end

  # Pushes the element onto the path and stores the attribute value for
  # every attribute field whose path matches the current path.
  def start_element(uri, localname, qname, attributes)
    element = XPath::Element.new(localname, attributes)
    @path.elements << element

    @parser.fields.each do |field|
      next unless @path.match?(field.path)
      if field.path.is_attribute?
        @row[field.name] = element.attributes[field.path.attribute]
      end
    end
  end

  # Stores the collected text for every non-attribute field whose path
  # matches, emits the row when the write trigger matches, then resets the
  # value and pops the element off the path.
  def end_element(uri, localname, qname)
    @parser.fields.each do |field|
      next unless @path.match?(field.path)
      @row[field.name] = @value unless field.path.is_attribute?
    end

    if @path.match?(@parser.write_trigger)
      # A clone is passed so later updates do not alter emitted rows.
      @proc.call(@row.clone)
    end

    @value = nil
    @path.elements.pop
  end

  # Remembers the last reported parse position.
  def progress(position)
    @position = position
  end
end
128
+
129
# Module which contains classes that are used for XPath-like filtering
# on the SAX parser
module XPath #:nodoc:
  # An ordered list of Element objects, written as
  # <tt>name[attr=pattern,...]</tt> segments joined by "/".
  class Path #:nodoc:
    # Splits a segment into the element name and its bracketed predicate.
    PREDICATE = /(.*)\[(.*)\]/

    # Get the elements in the path
    attr_accessor :elements

    # Initialize
    def initialize
      @elements = []
    end

    # Convert to a string representation
    def to_s
      @elements.map { |e| e.to_s }.join("/")
    end

    # Returns true if the last part of the path refers to an attribute.
    # An empty path refers to no attribute.
    def is_attribute?
      last = elements.last
      !last.nil? && last.attributes.length > 0
    end

    # Return the name of the attribute referenced by the last element in this path. Returns nil if the last element
    # does not reference an attribute.
    #
    # Warning: the path must only reference a single attribute; when several are given the first key of the
    # attributes Hash is returned.
    def attribute
      return nil unless is_attribute?
      elements.last.attributes.keys.first
    end

    # Return true if this XPath::Path matches the given path (a string or a Path). Both paths must have the same
    # number of elements, every element name must be equal, and every attribute pattern of the given path must
    # match the corresponding attribute value of this path. Evaluation stops at the first mismatch.
    def match?(s)
      path = Path.parse(s)
      return false unless path.elements.length == elements.length

      elements.zip(path.elements).all? do |element, path_element|
        element.name == path_element.name &&
          path_element.attributes.all? { |key, value| element.attributes[key] =~ value }
      end
    end

    # Parse the string into an XPath::Path object. A Path argument is returned unchanged.
    #
    # Each "/"-separated part may carry a bracketed, comma-separated list of <tt>key=pattern</tt> pairs; every
    # pattern is compiled to a Regexp, and a key without a pattern matches any value.
    def self.parse(s)
      return s if s.is_a?(Path)
      path = Path.new
      s.split('/').each do |part|
        name = part.sub(PREDICATE, '\1')
        predicate = part[PREDICATE, 2]
        attributes = {}
        unless predicate.nil?
          predicate.split(",").each do |pair|
            # Limit of 2 keeps "=" characters that belong to the pattern.
            key, value = pair.split("=", 2)
            value = ".*" if value.nil? || value.empty?
            attributes[key] = Regexp.new(value)
          end
        end
        path.elements << Element.new(name, attributes)
      end
      path
    end
  end

  # A single path segment: an element name plus a Hash of attributes.
  class Element #:nodoc:
    attr_reader :name
    attr_reader :attributes

    def initialize(name, attributes={})
      @name = name
      @attributes = attributes
    end

    # Renders the element as <tt>name</tt> or <tt>name[key=value,...]</tt>; Regexp values are shown by source.
    def to_s
      s = "#{name}"
      unless @attributes.empty?
        attr_str = @attributes.map do |key, value|
          value = value.source if value.is_a?(Regexp)
          "#{key}=#{value}"
        end.join(",")
        s << "[" + attr_str + "]"
      end
      s
    end
  end
end
217
+ end
218
+ end