xml_data_extractor 0.5.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +8 -0
- data/.github/workflows/ci.yml +5 -5
- data/.gitignore +1 -0
- data/Gemfile.lock +23 -23
- data/README.md +290 -1
- data/lib/src/extract/array_of.rb +2 -0
- data/lib/src/extract/array_value.rb +2 -0
- data/lib/src/extract/base.rb +2 -0
- data/lib/src/extract/expression.rb +2 -0
- data/lib/src/extract/hash_builder.rb +2 -0
- data/lib/src/extract/string_value.rb +2 -0
- data/lib/src/extract/unescape.rb +2 -0
- data/lib/src/extract/value_builder.rb +3 -1
- data/lib/src/extract/within.rb +2 -0
- data/lib/src/extractor.rb +3 -1
- data/lib/src/format/formatter.rb +2 -0
- data/lib/src/format/mapper.rb +2 -0
- data/lib/src/format/modifier.rb +2 -0
- data/lib/src/node.rb +2 -0
- data/xml_data_extractor.gemspec +2 -2
- metadata +5 -5
- data/.travis.yml +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 334896bd178759618062d648f74af638a88fde5c5cccfaf255279589207670a6
|
4
|
+
data.tar.gz: 6b85212f452f62bfa75a97c66f76c889cc39382d726d26b93a05800eb69e6dbe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6899c3dafed6462fcb816edbe341a33e7a7388b2f3ba2724d5c2e0bab190f7ce00256c8bda35fbeaed7eaeb58ccfad8c3597c94a1e332ef7e5c125efd6a50924
|
7
|
+
data.tar.gz: 1b0b37b90adba98c9b2085d6a300473dca39bcafd4c801c58a5877ae57ad518be8d6079d0c61725f73809779ea3ace7590e4258494e2d1e0a61b7e914f8e5f69
|
data/.editorconfig
ADDED
data/.github/workflows/ci.yml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
name: ci
|
2
|
-
|
3
|
-
|
2
|
+
|
3
|
+
on: [push, pull_request]
|
4
4
|
|
5
5
|
jobs:
|
6
6
|
qa:
|
@@ -8,7 +8,7 @@ jobs:
|
|
8
8
|
runs-on: ubuntu-latest
|
9
9
|
steps:
|
10
10
|
- name: Checkout code
|
11
|
-
uses: actions/checkout@
|
11
|
+
uses: actions/checkout@v3
|
12
12
|
|
13
13
|
- name: Freeze autogenerated files
|
14
14
|
run: |
|
@@ -18,7 +18,7 @@ jobs:
|
|
18
18
|
uses: ruby/setup-ruby@v1
|
19
19
|
|
20
20
|
- name: Cache Ruby Dependencies
|
21
|
-
uses: actions/cache@
|
21
|
+
uses: actions/cache@v3
|
22
22
|
with:
|
23
23
|
path: vendor/bundle
|
24
24
|
key: ${{ runner.os }}-gem-${{ hashFiles('.ruby-version') }}-${{ hashFiles('**/Gemfile.lock') }}
|
@@ -28,4 +28,4 @@ jobs:
|
|
28
28
|
|
29
29
|
- name: Run tests
|
30
30
|
run: |
|
31
|
-
bin/rspec
|
31
|
+
bin/rspec
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,46 +1,46 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
xml_data_extractor (0.
|
4
|
+
xml_data_extractor (0.6.0)
|
5
5
|
activesupport (~> 6.0)
|
6
6
|
nokogiri (~> 1.0)
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
-
activesupport (6.1.
|
11
|
+
activesupport (6.1.5)
|
12
12
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
13
13
|
i18n (>= 1.6, < 2)
|
14
14
|
minitest (>= 5.1)
|
15
15
|
tzinfo (~> 2.0)
|
16
16
|
zeitwerk (~> 2.3)
|
17
|
-
concurrent-ruby (1.1.
|
18
|
-
diff-lcs (1.
|
19
|
-
i18n (1.
|
17
|
+
concurrent-ruby (1.1.10)
|
18
|
+
diff-lcs (1.5.0)
|
19
|
+
i18n (1.10.0)
|
20
20
|
concurrent-ruby (~> 1.0)
|
21
|
-
mini_portile2 (2.
|
22
|
-
minitest (5.
|
23
|
-
nokogiri (1.
|
24
|
-
mini_portile2 (~> 2.
|
21
|
+
mini_portile2 (2.8.0)
|
22
|
+
minitest (5.15.0)
|
23
|
+
nokogiri (1.13.3)
|
24
|
+
mini_portile2 (~> 2.8.0)
|
25
25
|
racc (~> 1.4)
|
26
|
-
racc (1.
|
26
|
+
racc (1.6.0)
|
27
27
|
rake (12.3.3)
|
28
|
-
rspec (3.
|
29
|
-
rspec-core (~> 3.
|
30
|
-
rspec-expectations (~> 3.
|
31
|
-
rspec-mocks (~> 3.
|
32
|
-
rspec-core (3.
|
33
|
-
rspec-support (~> 3.
|
34
|
-
rspec-expectations (3.
|
28
|
+
rspec (3.11.0)
|
29
|
+
rspec-core (~> 3.11.0)
|
30
|
+
rspec-expectations (~> 3.11.0)
|
31
|
+
rspec-mocks (~> 3.11.0)
|
32
|
+
rspec-core (3.11.0)
|
33
|
+
rspec-support (~> 3.11.0)
|
34
|
+
rspec-expectations (3.11.0)
|
35
35
|
diff-lcs (>= 1.2.0, < 2.0)
|
36
|
-
rspec-support (~> 3.
|
37
|
-
rspec-mocks (3.
|
36
|
+
rspec-support (~> 3.11.0)
|
37
|
+
rspec-mocks (3.11.0)
|
38
38
|
diff-lcs (>= 1.2.0, < 2.0)
|
39
|
-
rspec-support (~> 3.
|
40
|
-
rspec-support (3.
|
39
|
+
rspec-support (~> 3.11.0)
|
40
|
+
rspec-support (3.11.0)
|
41
41
|
tzinfo (2.0.4)
|
42
42
|
concurrent-ruby (~> 1.0)
|
43
|
-
zeitwerk (2.4
|
43
|
+
zeitwerk (2.5.4)
|
44
44
|
|
45
45
|
PLATFORMS
|
46
46
|
ruby
|
@@ -51,4 +51,4 @@ DEPENDENCIES
|
|
51
51
|
xml_data_extractor!
|
52
52
|
|
53
53
|
BUNDLED WITH
|
54
|
-
2.
|
54
|
+
2.2.6
|
data/README.md
CHANGED
@@ -300,6 +300,295 @@ schemas:
|
|
300
300
|
}
|
301
301
|
```
|
302
302
|
|
303
|
+
### link
|
304
|
+
|
305
|
+
This command is useful when the XML contains references to other nodes; it works like a SQL JOIN. The path must be an expression containing the `<link>` identifier, which will be replaced by the value fetched from the `link:` command.
|
306
|
+
|
307
|
+
Example:
|
308
|
+
```yml
|
309
|
+
schemas:
|
310
|
+
bookings:
|
311
|
+
array_of: booking
|
312
|
+
date: booking_date
|
313
|
+
document: id
|
314
|
+
products:
|
315
|
+
array_of:
|
316
|
+
accomodation:
|
317
|
+
path: ../hotel[booking_id=<link>]/accomodation
|
318
|
+
link: id
|
319
|
+
```
|
320
|
+
```xml
|
321
|
+
<xml>
|
322
|
+
<booking>
|
323
|
+
<id>1</id>
|
324
|
+
<booking_date>2020-01-01</booking_date>
|
325
|
+
</booking>
|
326
|
+
<booking>
|
327
|
+
<id>2</id>
|
328
|
+
<booking_date>2020-01-02</booking_date>
|
329
|
+
</booking>
|
330
|
+
<hotel>
|
331
|
+
<booking_id>1</booking_id>
|
332
|
+
<accomodation>Standard</accomodation>
|
333
|
+
</hotel>
|
334
|
+
<hotel>
|
335
|
+
<booking_id>2</booking_id>
|
336
|
+
<accomodation>Premium</accomodation>
|
337
|
+
</hotel>
|
338
|
+
</xml>
|
339
|
+
```
|
340
|
+
```ruby
|
341
|
+
{
|
342
|
+
bookings: [
|
343
|
+
{
|
344
|
+
date: "2020-01-01",
|
345
|
+
document: "1",
|
346
|
+
products: [
|
347
|
+
{ accomodation: "Standard" }
|
348
|
+
]
|
349
|
+
},
|
350
|
+
{
|
351
|
+
date: "2020-01-02",
|
352
|
+
document: "2",
|
353
|
+
products: [
|
354
|
+
{ accomodation: "Premium" }
|
355
|
+
]
|
356
|
+
}
|
357
|
+
]
|
358
|
+
}
|
359
|
+
```
|
360
|
+
|
361
|
+
In this example, if we didn't use `link` to select only the hotel of each booking, the path would have returned two accommodations for each booking, and instead of extracting a string with the accommodation it would extract an array with all the accommodations for each booking.
|
362
|
+
|
363
|
+
You can combine `link` with `array_of` if you want to search for a list of elements filtered by some field; just provide the `path` and the `link`:
|
364
|
+
|
365
|
+
```yml
|
366
|
+
schemas:
|
367
|
+
bookings:
|
368
|
+
array_of: booking
|
369
|
+
date: date
|
370
|
+
document: id
|
371
|
+
products:
|
372
|
+
array_of:
|
373
|
+
path: ../products[booking_id=<link>]
|
374
|
+
link: id
|
375
|
+
....
|
376
|
+
```
|
377
|
+
|
378
|
+
### uniq_by
|
379
|
+
|
380
|
+
Can only be used with **array_of**.
|
381
|
+
|
382
|
+
This functionality is useful when some XML nodes are duplicated and you want to extract data from the first occurrence only. It behaves similarly to Ruby's **uniq** method on arrays.
|
383
|
+
For each path generated from `array_of`, the value fetched using `uniq_by` will be checked against the generated collection and the path will be discarded if the value already exists.
|
384
|
+
|
385
|
+
```yml
|
386
|
+
schemas:
|
387
|
+
bookings:
|
388
|
+
array_of:
|
389
|
+
path: booking
|
390
|
+
uniq_by: id
|
391
|
+
date: bdate
|
392
|
+
document: id
|
393
|
+
```
|
394
|
+
```xml
|
395
|
+
<xml>
|
396
|
+
<booking>
|
397
|
+
<id>1</id>
|
398
|
+
<bdate>2020-01-01</bdate>
|
399
|
+
</booking>
|
400
|
+
<booking>
|
401
|
+
<id>1</id>
|
402
|
+
<bdate>2020-01-01</bdate>
|
403
|
+
</booking>
|
404
|
+
</xml>
|
405
|
+
```
|
406
|
+
```ruby
|
407
|
+
{
|
408
|
+
bookings: [
|
409
|
+
{
|
410
|
+
date: "2020-01-01",
|
411
|
+
document: "1"
|
412
|
+
}
|
413
|
+
]
|
414
|
+
}
|
415
|
+
```
|
416
|
+
|
417
|
+
In this example, if we didn't use the `uniq_by` option, two elements with the same data would be extracted, like:
|
418
|
+
|
419
|
+
```ruby
|
420
|
+
{
|
421
|
+
bookings: [
|
422
|
+
{
|
423
|
+
date: "2020-01-01",
|
424
|
+
document: "1"
|
425
|
+
},
|
426
|
+
{
|
427
|
+
date: "2020-01-01",
|
428
|
+
document: "1"
|
429
|
+
}
|
430
|
+
]
|
431
|
+
}
|
432
|
+
```
|
433
|
+
|
434
|
+
### array_presence: first_only
|
435
|
+
|
436
|
+
The field that contains this property will only be added to the first item of the array.
|
437
|
+
|
438
|
+
Can only be used in fields that belong to a node of `array_of`.
|
439
|
+
|
440
|
+
```yml
|
441
|
+
passengers:
|
442
|
+
array_of: bookings/booking/passengers/passenger
|
443
|
+
id:
|
444
|
+
path: document
|
445
|
+
modifier: to_s
|
446
|
+
name:
|
447
|
+
attr: [FirstName, LastName]
|
448
|
+
modifier:
|
449
|
+
- name: join
|
450
|
+
params: [" "]
|
451
|
+
tax_rav:
|
452
|
+
array_presence: first_only
|
453
|
+
path: ../rav
|
454
|
+
modifier: to_f
|
455
|
+
```
|
456
|
+
```xml
|
457
|
+
<bookings>
|
458
|
+
<booking>
|
459
|
+
<rav>150</rav>
|
460
|
+
<passengers>
|
461
|
+
<passenger>
|
462
|
+
<document>109.111.019-79</document>
|
463
|
+
<FirstName>Marcelo</FirstName>
|
464
|
+
<LastName>Lauxen</LastName>
|
465
|
+
</passenger>
|
466
|
+
<passenger>
|
467
|
+
<document>110.155.019-78</document>
|
468
|
+
<FirstName>Corona</FirstName>
|
469
|
+
<LastName>Virus</LastName>
|
470
|
+
</passenger>
|
471
|
+
</passengers>
|
472
|
+
</booking>
|
473
|
+
</bookings>
|
474
|
+
```
|
475
|
+
```ruby
|
476
|
+
{
|
477
|
+
bookings: [
|
478
|
+
{
|
479
|
+
passengers: [
|
480
|
+
{
|
481
|
+
id: "109.111.019-79",
|
482
|
+
name: "Marcelo Lauxen",
|
483
|
+
tax_rav: 150.00
|
484
|
+
},
|
485
|
+
{
|
486
|
+
id: "110.155.019-78",
|
487
|
+
name: "Corona Virus"
|
488
|
+
}
|
489
|
+
]
|
490
|
+
}
|
491
|
+
]
|
492
|
+
}
|
493
|
+
```
|
494
|
+
|
495
|
+
In this example the field `tax_rav` was only included on the first passenger because this field has the `array_presence: first_only` property.
|
496
|
+
|
497
|
+
### in_parent
|
498
|
+
|
499
|
+
This option allows you to navigate to a parent node of the current node.
|
500
|
+
|
501
|
+
```yml
|
502
|
+
passengers:
|
503
|
+
array_of: bookings/booking/passengers/passenger
|
504
|
+
id:
|
505
|
+
path: document
|
506
|
+
modifier: to_s
|
507
|
+
bookings_id:
|
508
|
+
in_parent: bookings
|
509
|
+
path: bookings_id
|
510
|
+
```
|
511
|
+
```xml
|
512
|
+
<bookings>
|
513
|
+
<bookings_id>8888</bookings_id>
|
514
|
+
<booking>
|
515
|
+
<passengers>
|
516
|
+
<passenger>
|
517
|
+
<document>109.111.019-79</document>
|
518
|
+
</passenger>
|
519
|
+
<passenger>
|
520
|
+
<document>110.155.019-78</document>
|
521
|
+
</passenger>
|
522
|
+
</passengers>
|
523
|
+
</booking>
|
524
|
+
</bookings>
|
525
|
+
```
|
526
|
+
```ruby
|
527
|
+
{
|
528
|
+
bookings: [
|
529
|
+
{
|
530
|
+
passengers: [
|
531
|
+
{
|
532
|
+
id: "109.111.019-79",
|
533
|
+
bookings_id: 8888
|
534
|
+
},
|
535
|
+
{
|
536
|
+
id: "110.155.019-78",
|
537
|
+
bookings_id: 8888
|
538
|
+
}
|
539
|
+
]
|
540
|
+
}
|
541
|
+
]
|
542
|
+
}
|
543
|
+
```
|
544
|
+
|
545
|
+
In this example the value of `bookings_id` will be extracted starting at the node provided in `in_parent` instead of the current node. It's also possible to navigate to a parent node with `../` (XPath provides this functionality), but with `in_parent` you only need to provide the name of the parent node: it will navigate up until that parent node is found, no matter how many levels away it is.
|
546
|
+
|
547
|
+
### keep_if
|
548
|
+
|
549
|
+
This option keeps a block of the hash in the final result only if the condition matches; otherwise the block is removed.
|
550
|
+
|
551
|
+
```yml
|
552
|
+
schemas:
|
553
|
+
dummy:
|
554
|
+
within: data
|
555
|
+
description: additional_desc
|
556
|
+
exchange: currency_info/value
|
557
|
+
price: price
|
558
|
+
payment:
|
559
|
+
type: payment_info/method
|
560
|
+
value: payment_info/price
|
561
|
+
keep_if: "'type' == 'invoice'"
|
562
|
+
```
|
563
|
+
```xml
|
564
|
+
<data>
|
565
|
+
<additional_desc>Keep walking</additional_desc>
|
566
|
+
<currency_info kind="USD">
|
567
|
+
<value>4.15</value>
|
568
|
+
</currency_info>
|
569
|
+
<price>55.09</price>
|
570
|
+
<payment_info>
|
571
|
+
<method>card</method>
|
572
|
+
<price>55.48</price>
|
573
|
+
<payment>
|
574
|
+
<installments>2</installments>
|
575
|
+
<card_number>333</card_number>
|
576
|
+
</payment>
|
577
|
+
</payment_info>
|
578
|
+
</data>
|
579
|
+
```
|
580
|
+
```ruby
|
581
|
+
{
|
582
|
+
dummy: {
|
583
|
+
description: "Keep walking",
|
584
|
+
exchange: "4.15",
|
585
|
+
price: "55.09"
|
586
|
+
}
|
587
|
+
}
|
588
|
+
```
|
589
|
+
|
590
|
+
In this example the condition didn't match since the payment method was `card` instead of `invoice` and then the extracted payment hash was removed from the final result.
|
591
|
+
|
303
592
|
### Formatting:
|
304
593
|
|
305
594
|
#### fixed
|
@@ -379,7 +668,7 @@ schemas:
|
|
379
668
|
path: [firstname, lastname]
|
380
669
|
modifier:
|
381
670
|
- name: join
|
382
|
-
params: [" "]
|
671
|
+
params: [" "]
|
383
672
|
- downcase
|
384
673
|
```
|
385
674
|
```xml
|
data/lib/src/extract/array_of.rb
CHANGED
data/lib/src/extract/base.rb
CHANGED
data/lib/src/extract/unescape.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative "base"
|
2
4
|
require_relative "array_value"
|
3
5
|
require_relative "array_of"
|
@@ -25,7 +27,7 @@ module Extract
|
|
25
27
|
|
26
28
|
def value_for_hash
|
27
29
|
props = node.props
|
28
|
-
|
30
|
+
|
29
31
|
Unescape.new(node, extractor).unescape! if props[:unescape]
|
30
32
|
|
31
33
|
fixed_value = props[:fixed]
|
data/lib/src/extract/within.rb
CHANGED
data/lib/src/extractor.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require "cgi"
|
2
4
|
require "active_support/core_ext/string"
|
3
5
|
require_relative "format/formatter"
|
@@ -37,7 +39,7 @@ class PathBuilder < Struct.new(:base, :parent, :tag, keyword_init: true)
|
|
37
39
|
end
|
38
40
|
|
39
41
|
def matching_tags?(item, tag)
|
40
|
-
item.gsub(/\[\d
|
42
|
+
item.gsub(/\[\d+\]/, "") == tag
|
41
43
|
end
|
42
44
|
end
|
43
45
|
|
data/lib/src/format/formatter.rb
CHANGED
data/lib/src/format/mapper.rb
CHANGED
data/lib/src/format/modifier.rb
CHANGED
data/lib/src/node.rb
CHANGED
data/xml_data_extractor.gemspec
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "xml_data_extractor"
|
3
|
-
spec.version = "0.
|
3
|
+
spec.version = "0.6.0"
|
4
4
|
spec.authors = ["Fernando Almeida"]
|
5
5
|
spec.email = ["fernandoprsbr@gmail.com"]
|
6
6
|
|
7
7
|
spec.summary = "Provides a simples DSL for extracting data from XML documents"
|
8
8
|
spec.homepage = "https://github.com/monde-sistemas/xml_data_extractor"
|
9
9
|
spec.license = "MIT"
|
10
|
-
spec.required_ruby_version = Gem::Requirement.new(">= 2.
|
10
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 2.5")
|
11
11
|
|
12
12
|
spec.metadata["homepage_uri"] = spec.homepage
|
13
13
|
spec.metadata["source_code_uri"] = spec.homepage
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xml_data_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Fernando Almeida
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-04-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -73,13 +73,13 @@ executables: []
|
|
73
73
|
extensions: []
|
74
74
|
extra_rdoc_files: []
|
75
75
|
files:
|
76
|
+
- ".editorconfig"
|
76
77
|
- ".github/dependabot.yml"
|
77
78
|
- ".github/workflows/ci.yml"
|
78
79
|
- ".gitignore"
|
79
80
|
- ".kodiak.toml"
|
80
81
|
- ".rspec"
|
81
82
|
- ".ruby-version"
|
82
|
-
- ".travis.yml"
|
83
83
|
- Gemfile
|
84
84
|
- Gemfile.lock
|
85
85
|
- LICENSE.txt
|
@@ -119,14 +119,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
119
119
|
requirements:
|
120
120
|
- - ">="
|
121
121
|
- !ruby/object:Gem::Version
|
122
|
-
version: 2.
|
122
|
+
version: '2.5'
|
123
123
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
124
124
|
requirements:
|
125
125
|
- - ">="
|
126
126
|
- !ruby/object:Gem::Version
|
127
127
|
version: '0'
|
128
128
|
requirements: []
|
129
|
-
rubygems_version: 3.2.
|
129
|
+
rubygems_version: 3.2.25
|
130
130
|
signing_key:
|
131
131
|
specification_version: 4
|
132
132
|
summary: Provides a simples DSL for extracting data from XML documents
|