micdrop 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +12 -0
- data/examples/data/catalog.xml +46 -0
- data/examples/data/readme.md +2 -1
- data/examples/xml_to_sql.rb +73 -0
- data/lib/micdrop/ext/microfocus.rb +240 -0
- data/lib/micdrop/ext/nokogiri.rb +180 -0
- data/lib/micdrop/ext/sequel.rb +16 -4
- data/lib/micdrop/item_context.rb +7 -0
- data/lib/micdrop/record_context.rb +32 -2
- data/lib/micdrop/version.rb +1 -1
- metadata +5 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8e567ed6bfc0a336d28ec7f7cee0d3bd987109b83d1ac634071a79d875002824
+  data.tar.gz: 0bd90f322d2201b782b3cca29213ffd8ba4078f671ce3a04302e9404ec7a14f7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ee163fb3d7be2fd634e465196fd5053622d9b46246cf5177a75c00bbbdf7cba99c60d7fae661648c652e8a8f500580f3342b7f4fb99764330960c3fecbd62c86
+  data.tar.gz: d081c87bcb894e3718290794892be8e48283507615cdeee2b6b47206fb3497f1e45f26d32894b58f7b0cc312031f75c9cd48aca540e3fd35d4d912117cfa75c9
data/README.md
CHANGED
@@ -446,3 +446,15 @@ Micdrop.migrate source, sink do
 end
 ```
 
+If needed, you can also use the `before_flush` or `after_flush` hooks to add actions before or after the flush. Both take the same form:
+
+```ruby
+Micdrop.migrate source, sink do
+  after_flush do |record, collected|
+    # `record` is the RootRecordContext, `collected` is the hash of `put` values.
+    # For example, you could do something like this if the sink was a Sequel InsertSink
+    puts "Inserted ID #{record.sink.insert_id} with data #{collected.inspect}"
+  end
+  # Then do your normal migration operations here
+end
+```
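For the companion `before_flush` hook, here is a minimal sketch (illustration only, not part of the package README). It assumes, as the `record_context.rb` change later in this diff suggests, that the block receives the same collector hash that is subsequently handed to the sink, so keys added in the hook end up in the flushed row; the `:imported_at` column is purely hypothetical.

```ruby
Micdrop.migrate source, sink do
  before_flush do |record, collected|
    # `collected` is the hash of `put` values about to be sent to the sink;
    # adding a key here (hypothetical :imported_at column) stamps every row.
    collected[:imported_at] = Time.now
  end
  # Then do your normal migration operations here
end
```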
data/examples/data/catalog.xml
ADDED
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<catalog>
+  <product id="P001">
+    <name>Laptop Pro 15</name>
+    <category>Electronics</category>
+    <price currency="USD">1299.99</price>
+    <inStock>true</inStock>
+    <quantity>45</quantity>
+    <specifications>
+      <processor>Intel Core i7-13700H</processor>
+      <ram>16GB DDR5</ram>
+      <storage>512GB NVMe SSD</storage>
+      <display>15.6" FHD IPS</display>
+      <graphics>NVIDIA RTX 4060</graphics>
+    </specifications>
+    <images>
+      <image type="thumbnail">laptop-thumb.jpg</image>
+      <image type="main">laptop-main.jpg</image>
+    </images>
+    <ratings>
+      <average>4.5</average>
+      <count>127</count>
+    </ratings>
+  </product>
+  <product id="P002">
+    <name>Wireless Mouse</name>
+    <category>Accessories</category>
+    <price currency="USD">29.99</price>
+    <inStock>true</inStock>
+    <quantity>230</quantity>
+    <specifications>
+      <connectivity>Bluetooth 5.0</connectivity>
+      <battery>2x AA</battery>
+      <dpi>1600</dpi>
+      <buttons>6</buttons>
+    </specifications>
+    <images>
+      <image type="thumbnail">mouse-thumb.jpg</image>
+      <image type="main">mouse-main.jpg</image>
+    </images>
+    <ratings>
+      <average>4.7</average>
+      <count>89</count>
+    </ratings>
+  </product>
+</catalog>
data/examples/data/readme.md
CHANGED
data/examples/xml_to_sql.rb
ADDED
@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+
+$LOAD_PATH.unshift File.expand_path("../lib", __dir__)
+require "micdrop"
+require "sequel"
+require "micdrop/ext/sequel"
+require "micdrop/ext/nokogiri"
+
+DB = Sequel.sqlite "test.db"
+
+# Create the destination data structure.
+# Obviously in a real import script, these would probably already exist.
+
+DB.create_table :products do
+  String :code, primary_key: true
+  String :name
+  String :category
+  BigDecimal :price, size: [6, 2]
+  Integer :stock
+end
+
+DB.create_table :product_specs do
+  String :code
+  String :key
+  String :value
+  primary_key %i[code key]
+end
+
+# Now start the migration
+document = Nokogiri::XML.parse File.open File.join(__dir__, "data/catalog.xml")
+
+# Our source will iterate over the <product> elements in the XML document
+source = document.css("product")
+sink = Micdrop::Ext::Sequel::InsertSink.new DB[:products]
+
+Micdrop.migrate source, sink do
+  # Each record is a <product> element; its attributes are takeable items
+  take "id", put: :code
+  at_css("name").take_content put: :name
+  at_css("category").take_content put: :category
+  at_css("price").take_content do
+    parse_float
+    put :price
+  end
+  at_css("quantity").take_content put: :stock
+end
+
+# Then over the individual specs
+source = document.css("product")
+sink = Micdrop::Ext::Sequel::InsertSink.new DB[:product_specs]
+
+Micdrop.migrate source, sink do
+  # Remember the product id so it can be re-put on each spec row
+  code = take "id"
+  css("specifications > *").each_subrecord(flush: true, reset: true) do
+    code.put :code
+    take_node_name do
+      lookup({
+        "battery" => "Battery",
+        "buttons" => "Button Count",
+        "connectivity" => "Connectivity",
+        "display" => "Screen",
+        "dpi" => "Screen DPI",
+        "graphics" => "GPU",
+        "processor" => "CPU",
+        "ram" => "Memory",
+        "storage" => "Storage"
+      })
+      put :key
+    end
+    take_content.put :value
+  end
+end
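As a quick sanity check of the example script above, the resulting `test.db` can be inspected with plain Sequel (no micdrop involved). The table and column names below come from the script itself; the snippet is illustration only and not part of the package.

```ruby
require "sequel"

DB = Sequel.sqlite "test.db"

# Print each imported product together with how many spec rows it received.
DB[:products].each do |product|
  spec_count = DB[:product_specs].where(code: product[:code]).count
  puts "#{product[:code]}: #{product[:name]} (#{spec_count} specs, #{product[:stock]} in stock)"
end
```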
data/lib/micdrop/ext/microfocus.rb
ADDED
@@ -0,0 +1,240 @@
+require "date"
+require "forwardable"
+
+module Micdrop
+  module Ext
+    ##
+    # A simple parser to extract data from a "Micro Focus File with Header (DAT)" file.
+    #
+    # Based on this spec: https://www.microfocus.com/documentation/server-express/sx20books/fhfile.htm
+    #
+    # This format comes from old COBOL programs, and each file is conceptually similar to an SQL
+    # database table. Unlike SQL though, these DAT files lack type information; each row is raw
+    # binary and must be unpacked.
+    #
+    # This does not implement the full spec, and is not well tested, but "works on my machine".
+    module Microfocus
+      ##
+      # A header value that appears at the beginning of each record to determine the record type
+      module RecordType
+        DUPLICATE_SYSTEM = 0b0001
+        DELETED = 0b0010
+        SYSTEM = 0b0011
+        NORMAL = 0b0100
+        REDUCED = 0b0101
+        POINTER = 0b0110
+        POINTER_REF = 0b0111
+        REDUCED_POINTER_REF = 0b1000
+      end
+
+      ##
+      # Flag indicating how records are organized in the file
+      module RecordOrganization
+        SEQUENTIAL = 1
+        INDEXED = 2
+        RELATIVE = 3
+      end
+
+      ##
+      # Representation of a single record within a file
+      class Record
+        extend Forwardable
+
+        def initialize(type, body, unpack_spec: nil, unpack_mapping: nil)
+          @type = type
+          @body = body
+          @fields = nil
+          unpack unpack_spec, unpack_mapping unless unpack_spec.nil?
+        end
+
+        attr_reader :type, :body, :fields
+
+        def_delegators :@fields, :[], :each
+
+        private
+
+        def unpack(spec, mapping = nil)
+          fields = @body.unpack spec
+          fields = if mapping.nil?
+                     fields
+                   else
+                     mapping.transform_values { |value| fields[value] }
+                   end
+          @fields = fields.freeze
+        end
+      end
+
+      ##
+      # Read a MicroFocus data file
+      class MicroFocusReader
+        def initialize(data_file, unpack_spec: nil, unpack_mapping: nil)
+          @data_file = data_file
+          @unpack_spec = unpack_spec
+          @unpack_mapping = unpack_mapping
+          read_data_header
+        end
+
+        attr_reader :creation_time, :compression, :index_type, :variable_length, :min_legth, :max_length, :index_version
+
+        def long_records?
+          @long_records
+        end
+
+        def sequential?
+          @organization == RecordOrganization::SEQUENTIAL
+        end
+
+        def indexed?
+          @organization == RecordOrganization::INDEXED
+        end
+
+        def relative?
+          @organization == RecordOrganization::RELATIVE
+        end
+
+        def each
+          return enum_for :each unless block_given?
+
+          yield read_record until @data_file.eof?
+        end
+
+        private
+
+        def read_data_header
+          parse_data_file_header @data_file.read(128)
+        end
+
+        def read_record
+          header = @data_file.read(@long_records ? 4 : 2)
+          type = header.unpack1("C") >> 4
+          length = header.unpack1(@long_records ? "N" : "n") & (@long_records ? 0xFFFFFFF : 0xFFF)
+          body = @data_file.read length
+          scan_padding
+          Record.new type, body, unpack_spec: @unpack_spec, unpack_mapping: @unpack_mapping
+        end
+
+        ##
+        # Parse the first four bytes of the header, which are used to determine the record size
+        def parse_data_file_header(data)
+          # The first 4 bits are the record type, which must be SYSTEM
+          type = data.unpack1("C") >> 4
+          raise StandardError, "This file does not have a valid header" unless type == RecordType::SYSTEM
+
+          # The next 12 bits (or 28 bits, depending on the max record size) are the header record size
+          length = data.unpack1("n") & 0xFFF
+          if length == 126
+            # Header data is 126 bytes, max record length is less than 4095 bytes
+            @long_records = false
+          elsif length == 0
+            # Header data is 124 bytes, max record length is 4095 bytes or greater
+            length = data.unpack1("N") & 0xFFF
+            raise StandardError, "Invalid header record length" unless length == 124
+
+            @long_records = true
+          else
+            raise StandardError, "Invalid header record length"
+          end
+
+          # Regardless of the listed header length, the actual header data is always at the same byte offsets
+          (
+            @db_seq,
+            integrity, # The specs say this integrity flag is 3 bytes, not 2, but I think the spec must be wrong
+            creation_time,
+            special62,
+            @organization,
+            @compression,
+            @index_type,
+            variable_length,
+            @min_legth,
+            @max_length,
+            @index_version
+          ) = data.unpack "x4 n n A14 x14 n x C x C x C x C x5 N N x46 N"
+
+          # Check integrity
+          raise StandardError, "Integrity flag non-zero; file is corrupt" if integrity != 0
+          raise StandardError, "Bytes 36-37 not equal to 62; file is corrupt" if special62 != 62
+
+          # Type-cast some of the header values
+          @creation_time = DateTime.strptime creation_time[0..11], "%y%m%d%H%M%S"
+          @variable_length = !!variable_length.nil?
+        end
+
+        ##
+        # Scan forward to the next non-null byte
+        def scan_padding
+          # TODO: This is a work-around because it seems I don't have align_cursor working correctly yet
+          return if @data_file.eof?
+
+          return if @data_file.eof? until @data_file.readbyte.positive?
+          @data_file.seek(-1, :CUR)
+        end
+
+        ##
+        # Aligns the file cursor to the next address which is a multiple of the data alignment value
+        #
+        # Automatically detect the alignment from the index if not provided
+        #
+        # Index formats 1 and 2 have no alignment, 3 and 4 are aligned to 4 bytes, and 8 is aligned to 8 bytes
+        def align_cursor
+          alignment = if @index_type < 3
+                        return # offset of 1, so we don't need to do anything
+                      elsif @index_type < 5
+                        4
+                      else
+                        8
+                      end
+
+          offset = @data_file.tell % alignment
+          @data_file.seek(alignment - offset, :CUR) if offset.positive?
+        end
+      end
+
+      ##
+      # This is the main entrypoint to read a file, and its output is usable as a source.
+      #
+      # `unpack_spec` is an optional spec, as would be passed to `String#unpack`, to extract the
+      # individual columns from the record. You may also provide an `unpack_mapping` which maps more
+      # human-readable column names to column indexes.
+      def self.read_microfocus_file(filename, unpack_spec: nil, unpack_mapping: nil)
+        File.open filename, "rb" do |file|
+          reader = MicroFocusReader.new file, unpack_spec: unpack_spec, unpack_mapping: unpack_mapping
+          reader.each.entries
+        end
+      end
+    end
+  end
+
+  ##
+  # Extend ItemContext with parse_microfocus
+  class ItemContext
+    ##
+    # Parse the value as a MicroFocus DAT file
+    #
+    # If a block is provided, it will act as a record context where object properties can be taken.
+    #
+    # If include_header is true, the value will be a hash containing both the header information
+    # and the actual records.
+    def parse_microfocus(include_header: false, unpack_spec: nil, unpack_mapping: nil, &block)
+      return self if @value.nil?
+
+      reader = Micdrop::Ext::Microfocus::MicroFocusReader.new @value, unpack_spec: unpack_spec,
+                                                              unpack_mapping: unpack_mapping
+      @value = if include_header
+                 {
+                   creation_time: reader.creation_time,
+                   compression: reader.compression,
+                   index_type: reader.index_type,
+                   variable_length: reader.variable_length,
+                   min_legth: reader.min_legth,
+                   max_length: reader.max_length,
+                   index_version: reader.index_version,
+                   records: reader.each.entries
+                 }
+               else
+                 reader.each.entries
+               end
+      enter(&block) unless block.nil?
+      self
+    end
+  end
+end
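A hedged usage sketch for the new reader: only `read_microfocus_file` and the `Record#[]` access come from the file above, while the data file name, the unpack spec, and the field names are invented for illustration.

```ruby
require "micdrop"
require "micdrop/ext/microfocus"

# Hypothetical fixed-width layout: a 10-byte account code followed by a
# 30-byte name, both space-padded ASCII ("A10A30" in String#unpack terms).
records = Micdrop::Ext::Microfocus.read_microfocus_file(
  "customers.dat",
  unpack_spec: "A10A30",
  unpack_mapping: { account: 0, name: 1 }
)

# With an unpack_mapping, each record's fields are addressable by name.
records.each do |record|
  puts "#{record[:account]}: #{record[:name]}"
end
```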
data/lib/micdrop/ext/nokogiri.rb
ADDED
@@ -0,0 +1,180 @@
+# frozen_string_literal: true
+
+require "nokogiri"
+
+module Micdrop
+  ##
+  # Extend ItemContext with HTML/XML functions
+  class ItemContext
+    ##
+    # Alias for scope.enter.take_content
+    def take_content(put: nil, convert: nil, apply: nil, &block)
+      scope.enter.take_content(put: put, convert: convert, apply: apply, &block)
+    end
+
+    ##
+    # Alias for scope.enter.take_node_name
+    def take_node_name(put: nil, convert: nil, apply: nil, &block)
+      scope.enter.take_node_name(put: put, convert: convert, apply: apply, &block)
+    end
+
+    ##
+    # Parse HTML and enter a sub-record context for the root node
+    def parse_html(&block)
+      doc = @value.nil? ? nil : ::Nokogiri::HTML.parse(@value)
+      nokogiri_node_subrecord_helper(doc, block)
+    end
+
+    ##
+    # Parse HTML5 and enter a sub-record context for the root node
+    def parse_html5(&block)
+      doc = @value.nil? ? nil : ::Nokogiri::HTML5.parse(@value)
+      nokogiri_node_subrecord_helper(doc, block)
+    end
+
+    ##
+    # Parse XML and enter a sub-record context for the root node
+    def parse_xml(&block)
+      doc = @value.nil? ? nil : ::Nokogiri::XML.parse(@value)
+      nokogiri_node_subrecord_helper(doc, block)
+    end
+
+    ##
+    # Parse an HTML fragment and enter a sub-record context for the root node
+    def parse_html_fragment(&block)
+      doc = @value.nil? ? nil : ::Nokogiri::HTML.fragment(@value)
+      nokogiri_node_subrecord_helper(doc, block)
+    end
+
+    ##
+    # Parse an HTML5 fragment and enter a sub-record context for the root node
+    def parse_html5_fragment(&block)
+      doc = @value.nil? ? nil : ::Nokogiri::HTML5.fragment(@value)
+      nokogiri_node_subrecord_helper(doc, block)
+    end
+
+    ##
+    # Parse an XML fragment and enter a sub-record context for the root node
+    def parse_xml_fragment(&block)
+      doc = @value.nil? ? nil : ::Nokogiri::XML.fragment(@value)
+      nokogiri_node_subrecord_helper(doc, block)
+    end
+
+    ##
+    # Decode an HTML entity-encoded string to plain text
+    def decode_html
+      return self if @value.nil?
+
+      frag = ::Nokogiri::HTML.fragment @value
+      @value = frag.content
+      self
+    end
+
+    ##
+    # Encode a string using HTML entities
+    def encode_html(nl2br: false)
+      return self if @value.nil?
+
+      frag = ::Nokogiri::HTML.fragment ""
+      frag.content = @value
+      @value = frag.to_s
+      @value = @value.gsub "\n", "<br/>" if nl2br
+      self
+    end
+
+    ##
+    # Decode an HTML5 entity-encoded string to plain text
+    def decode_html5
+      return self if @value.nil?
+
+      frag = ::Nokogiri::HTML5.fragment @value
+      @value = frag.content
+      self
+    end
+
+    ##
+    # Encode a string using HTML5 entities
+    def encode_html5(nl2br: false)
+      return self if @value.nil?
+
+      frag = ::Nokogiri::HTML5.fragment ""
+      frag.content = @value
+      @value = frag.to_s
+      @value = @value.gsub "\n", "<br/>" if nl2br
+      self
+    end
+
+    ##
+    # Decode an XML entity-encoded string to plain text
+    def decode_xml
+      return self if @value.nil?
+
+      frag = ::Nokogiri::XML.fragment @value
+      @value = frag.content
+      self
+    end
+
+    ##
+    # Encode a string using XML entities
+    def encode_xml
+      return self if @value.nil?
+
+      frag = ::Nokogiri::XML.fragment ""
+      frag.content = @value
+      @value = frag.to_s
+      self
+    end
+
+    private
+
+    def nokogiri_node_subrecord_helper(node, block)
+      item_ctx = ItemContext.new @record_context, node
+      subrec_ctx = SubRecordContext.new item_ctx, @record_context
+      subrec_ctx.instance_eval(&block) unless block.nil?
+      subrec_ctx
+    end
+  end
+
+  ##
+  # Extend RecordContext with HTML/XML functions
+  class RecordContext
+    ##
+    # Take the text content of the XML or HTML node
+    def take_content(put: nil, convert: nil, apply: nil, &block)
+      value = @record&.content
+      process_item_helper(value, put, convert, apply, block)
+    end
+
+    ##
+    # Take the node name of the XML or HTML node
+    def take_node_name(put: nil, convert: nil, apply: nil, &block)
+      value = @record&.node_name
+      process_item_helper(value, put, convert, apply, block)
+    end
+
+    def xpath(*args, &block)
+      nokogiri_node_subrecord_helper(@record.xpath(*args), block)
+    end
+
+    def at_xpath(*args, &block)
+      nokogiri_node_subrecord_helper(@record.at_xpath(*args), block)
+    end
+
+    def css(*args, &block)
+      nokogiri_node_subrecord_helper(@record.css(*args), block)
+    end
+
+    def at_css(*args, &block)
+      nokogiri_node_subrecord_helper(@record.at_css(*args), block)
+    end
+
+    private
+
+    def nokogiri_node_subrecord_helper(node, block)
+      item_ctx = ItemContext.new self, node
+      subrec_ctx = SubRecordContext.new item_ctx, self
+      subrec_ctx.instance_eval(&block) unless block.nil?
+      subrec_ctx
+    end
+  end
+end
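A hedged sketch of the new ItemContext helpers in use mid-migration. The `summary_html` and `notes` items, the target column names, and the surrounding source/sink are invented; `decode_html` and `encode_html` are the helpers added above, and both return the item context, so they chain.

```ruby
require "micdrop"
require "micdrop/ext/nokogiri"

Micdrop.migrate source, sink do
  # Hypothetical item holding entity-encoded markup: reduce it to plain text.
  take("summary_html").decode_html.put :summary

  # Hypothetical plain-text item: entity-encode it, turning newlines into <br/>.
  take("notes").encode_html(nl2br: true).put :notes_html
end
```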
data/lib/micdrop/ext/sequel.rb
CHANGED
@@ -12,8 +12,10 @@ module Micdrop
           @dataset = dataset
         end
 
+        attr_reader :insert_id
+
         def <<(collector)
-          @dataset.insert(**collector)
+          @insert_id = @dataset.insert(**collector)
         end
       end
 
@@ -56,6 +58,8 @@ module Micdrop
           @match_empty_key = match_empty_key
         end
 
+        attr_reader :insert_id, :was_insert
+
        def <<(collector)
          dataset = @dataset
          @key_columns.each do |col|
@@ -65,9 +69,12 @@ module Micdrop
          if existing.count > 1
            raise Micdrop::SinkError, "Key column(s) of this InsertUpdateSink are not unique"
          elsif existing.empty?
-            dataset.insert(**collector)
+            @insert_id = dataset.insert(**collector)
+            @was_insert = true
          else
            dataset.update(**update_merge(existing.first, collector))
+            @insert_id = nil
+            @was_insert = false
          end
        end
 
@@ -101,10 +108,15 @@ module Micdrop
   ##
   # Sequel-specific extensions for ItemContext
   class ItemContext
-    def db_lookup(dataset, key_col, val_col, pass_if_not_found: false, warn_if_not_found: nil,
+    def db_lookup(dataset, key_col, val_col = nil, pass_if_not_found: false, warn_if_not_found: nil,
+                  apply_if_not_found: nil)
       # TODO: allow registering db_lookups like we do normal lookups
       warn_if_not_found = true if warn_if_not_found.nil? && apply_if_not_found.nil?
-      found =
+      found = if val_col.nil?
+                dataset.where(key_col => @value).first
+              else
+                dataset.where(key_col => @value).get(val_col)
+              end
       if found.nil?
         warn format "Value %s not found in db_lookup", @value if warn_if_not_found
         if !apply_if_not_found.nil?
data/lib/micdrop/item_context.rb
CHANGED
data/lib/micdrop/record_context.rb
CHANGED
@@ -38,6 +38,18 @@ module Micdrop
       process_item_helper(value, put, convert, apply, block)
     end
 
+    ##
+    # Take the entire record as a single item
+    def take_whole(put: nil, convert: nil, apply: nil, &block)
+      process_item_helper(record, put, convert, apply, block)
+    end
+
+    ##
+    # Alias for take_whole.each_subrecord
+    def each_subrecord(flush: false, reset: false, &block)
+      take_whole.each_subrecord(flush: flush, reset: reset, &block)
+    end
+
     ##
     # A combined take/put shorthand, for migrations where many of the column names are the same
     def passthru(*names)
@@ -89,8 +101,6 @@ module Micdrop
       process_item_helper(value, put, convert, apply, block)
     end
 
-    # TODO: collect_hash (not sure what the signature of it should be?)
-
     ##
     # Skip the current record. This is similar to a plain-ruby `next` statement.
     def skip
@@ -124,6 +134,8 @@ module Micdrop
       @loop_item = loop_item
       @record = loop_item
       @loop_index = loop_index
+      @before_flush = nil
+      @after_flush = nil
       reset
     end
 
@@ -153,7 +165,9 @@ module Micdrop
     def flush(reset: true)
       return unless @dirty
 
+      @before_flush&.call self, @collector
       @sink << @collector
+      @after_flush&.call self, @collector
       self.reset if reset
     end
 
@@ -176,6 +190,22 @@ module Micdrop
         {}
       end
     end
+
+    ##
+    # Allows specifying a hook which will run before flush. The block will receive the record and the collector.
+    #
+    # Note that this must be called *before* any manual flush occurs to have any effect.
+    def before_flush(&block)
+      @before_flush = block
+    end
+
+    ##
+    # Allows specifying a hook which will run after flush. The block will receive the record and the collector.
+    #
+    # Note that this must be called *before* any manual flush occurs to have any effect.
+    def after_flush(&block)
+      @after_flush = block
+    end
   end
 
   ##
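A hedged sketch of the new `take_whole` / `each_subrecord` shorthand, mirroring the pattern of the XML example earlier in this diff. The shape of the source records (each exposing an `id` attribute and a `tags` list) and the sink columns are invented.

```ruby
Micdrop.migrate source, sink do
  owner = take "id"
  # Hypothetical "tags" item holding a list; emit one sink row per element.
  take("tags").each_subrecord(flush: true, reset: true) do
    owner.put :owner_id
    # Within each sub-record, the element itself is the whole record.
    take_whole put: :tag
  end
end
```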
data/lib/micdrop/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: micdrop
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Dominick Johnson
@@ -27,6 +27,7 @@ files:
 - Rakefile
 - TODO.md
 - examples/csvs_to_sql.rb
+- examples/data/catalog.xml
 - examples/data/customers-100.csv
 - examples/data/json/1.json
 - examples/data/json/2.json
@@ -42,8 +43,11 @@ files:
 - examples/data/people-100.csv
 - examples/data/readme.md
 - examples/json_files_to_sql.rb
+- examples/xml_to_sql.rb
 - lib/micdrop.rb
 - lib/micdrop/errors.rb
+- lib/micdrop/ext/microfocus.rb
+- lib/micdrop/ext/nokogiri.rb
 - lib/micdrop/ext/sequel.rb
 - lib/micdrop/files_source.rb
 - lib/micdrop/item_context.rb