xml_data_extractor 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.editorconfig +8 -0
- data/.github/workflows/ci.yml +5 -5
- data/.gitignore +1 -0
- data/Gemfile.lock +23 -23
- data/README.md +290 -1
- data/lib/src/extract/array_of.rb +2 -0
- data/lib/src/extract/array_value.rb +2 -0
- data/lib/src/extract/base.rb +2 -0
- data/lib/src/extract/expression.rb +2 -0
- data/lib/src/extract/hash_builder.rb +2 -0
- data/lib/src/extract/string_value.rb +2 -0
- data/lib/src/extract/unescape.rb +2 -0
- data/lib/src/extract/value_builder.rb +3 -1
- data/lib/src/extract/within.rb +2 -0
- data/lib/src/extractor.rb +3 -1
- data/lib/src/format/formatter.rb +2 -0
- data/lib/src/format/mapper.rb +2 -0
- data/lib/src/format/modifier.rb +2 -0
- data/lib/src/node.rb +2 -0
- data/xml_data_extractor.gemspec +2 -2
- metadata +5 -5
- data/.travis.yml +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 334896bd178759618062d648f74af638a88fde5c5cccfaf255279589207670a6
|
4
|
+
data.tar.gz: 6b85212f452f62bfa75a97c66f76c889cc39382d726d26b93a05800eb69e6dbe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6899c3dafed6462fcb816edbe341a33e7a7388b2f3ba2724d5c2e0bab190f7ce00256c8bda35fbeaed7eaeb58ccfad8c3597c94a1e332ef7e5c125efd6a50924
|
7
|
+
data.tar.gz: 1b0b37b90adba98c9b2085d6a300473dca39bcafd4c801c58a5877ae57ad518be8d6079d0c61725f73809779ea3ace7590e4258494e2d1e0a61b7e914f8e5f69
|
data/.editorconfig
ADDED
data/.github/workflows/ci.yml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
name: ci
|
2
|
-
|
3
|
-
|
2
|
+
|
3
|
+
on: [push, pull_request]
|
4
4
|
|
5
5
|
jobs:
|
6
6
|
qa:
|
@@ -8,7 +8,7 @@ jobs:
|
|
8
8
|
runs-on: ubuntu-latest
|
9
9
|
steps:
|
10
10
|
- name: Checkout code
|
11
|
-
uses: actions/checkout@
|
11
|
+
uses: actions/checkout@v3
|
12
12
|
|
13
13
|
- name: Freeze autogenerated files
|
14
14
|
run: |
|
@@ -18,7 +18,7 @@ jobs:
|
|
18
18
|
uses: ruby/setup-ruby@v1
|
19
19
|
|
20
20
|
- name: Cache Ruby Dependencies
|
21
|
-
uses: actions/cache@
|
21
|
+
uses: actions/cache@v3
|
22
22
|
with:
|
23
23
|
path: vendor/bundle
|
24
24
|
key: ${{ runner.os }}-gem-${{ hashFiles('.ruby-version') }}-${{ hashFiles('**/Gemfile.lock') }}
|
@@ -28,4 +28,4 @@ jobs:
|
|
28
28
|
|
29
29
|
- name: Run tests
|
30
30
|
run: |
|
31
|
-
bin/rspec
|
31
|
+
bin/rspec
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,46 +1,46 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
xml_data_extractor (0.
|
4
|
+
xml_data_extractor (0.6.0)
|
5
5
|
activesupport (~> 6.0)
|
6
6
|
nokogiri (~> 1.0)
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
-
activesupport (6.1.
|
11
|
+
activesupport (6.1.5)
|
12
12
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
13
13
|
i18n (>= 1.6, < 2)
|
14
14
|
minitest (>= 5.1)
|
15
15
|
tzinfo (~> 2.0)
|
16
16
|
zeitwerk (~> 2.3)
|
17
|
-
concurrent-ruby (1.1.
|
18
|
-
diff-lcs (1.
|
19
|
-
i18n (1.
|
17
|
+
concurrent-ruby (1.1.10)
|
18
|
+
diff-lcs (1.5.0)
|
19
|
+
i18n (1.10.0)
|
20
20
|
concurrent-ruby (~> 1.0)
|
21
|
-
mini_portile2 (2.
|
22
|
-
minitest (5.
|
23
|
-
nokogiri (1.
|
24
|
-
mini_portile2 (~> 2.
|
21
|
+
mini_portile2 (2.8.0)
|
22
|
+
minitest (5.15.0)
|
23
|
+
nokogiri (1.13.3)
|
24
|
+
mini_portile2 (~> 2.8.0)
|
25
25
|
racc (~> 1.4)
|
26
|
-
racc (1.
|
26
|
+
racc (1.6.0)
|
27
27
|
rake (12.3.3)
|
28
|
-
rspec (3.
|
29
|
-
rspec-core (~> 3.
|
30
|
-
rspec-expectations (~> 3.
|
31
|
-
rspec-mocks (~> 3.
|
32
|
-
rspec-core (3.
|
33
|
-
rspec-support (~> 3.
|
34
|
-
rspec-expectations (3.
|
28
|
+
rspec (3.11.0)
|
29
|
+
rspec-core (~> 3.11.0)
|
30
|
+
rspec-expectations (~> 3.11.0)
|
31
|
+
rspec-mocks (~> 3.11.0)
|
32
|
+
rspec-core (3.11.0)
|
33
|
+
rspec-support (~> 3.11.0)
|
34
|
+
rspec-expectations (3.11.0)
|
35
35
|
diff-lcs (>= 1.2.0, < 2.0)
|
36
|
-
rspec-support (~> 3.
|
37
|
-
rspec-mocks (3.
|
36
|
+
rspec-support (~> 3.11.0)
|
37
|
+
rspec-mocks (3.11.0)
|
38
38
|
diff-lcs (>= 1.2.0, < 2.0)
|
39
|
-
rspec-support (~> 3.
|
40
|
-
rspec-support (3.
|
39
|
+
rspec-support (~> 3.11.0)
|
40
|
+
rspec-support (3.11.0)
|
41
41
|
tzinfo (2.0.4)
|
42
42
|
concurrent-ruby (~> 1.0)
|
43
|
-
zeitwerk (2.4
|
43
|
+
zeitwerk (2.5.4)
|
44
44
|
|
45
45
|
PLATFORMS
|
46
46
|
ruby
|
@@ -51,4 +51,4 @@ DEPENDENCIES
|
|
51
51
|
xml_data_extractor!
|
52
52
|
|
53
53
|
BUNDLED WITH
|
54
|
-
2.
|
54
|
+
2.2.6
|
data/README.md
CHANGED
@@ -300,6 +300,295 @@ schemas:
|
|
300
300
|
}
|
301
301
|
```
|
302
302
|
|
303
|
+
### link
|
304
|
+
|
305
|
+
This command is useful when the XML contains references to other nodes; it works like a SQL JOIN. The path must be an expression containing the `<link>` identifier, which will be replaced by the value fetched from the `link:` command.
|
306
|
+
|
307
|
+
Example:
|
308
|
+
```yml
|
309
|
+
schemas:
|
310
|
+
bookings:
|
311
|
+
array_of: booking
|
312
|
+
date: booking_date
|
313
|
+
document: id
|
314
|
+
products:
|
315
|
+
array_of:
|
316
|
+
accomodation:
|
317
|
+
path: ../hotel[booking_id=<link>]/accomodation
|
318
|
+
link: id
|
319
|
+
```
|
320
|
+
```xml
|
321
|
+
<xml>
|
322
|
+
<booking>
|
323
|
+
<id>1</id>
|
324
|
+
<booking_date>2020-01-01</booking_date>
|
325
|
+
</booking>
|
326
|
+
<booking>
|
327
|
+
<id>2</id>
|
328
|
+
<booking_date>2020-01-02</booking_date>
|
329
|
+
</booking>
|
330
|
+
<hotel>
|
331
|
+
<booking_id>1</booking_id>
|
332
|
+
<accomodation>Standard</accomodation>
|
333
|
+
</hotel>
|
334
|
+
<hotel>
|
335
|
+
<booking_id>2</booking_id>
|
336
|
+
<accomodation>Premium</accomodation>
|
337
|
+
</hotel>
|
338
|
+
</xml>
|
339
|
+
```
|
340
|
+
```ruby
|
341
|
+
{
|
342
|
+
bookings: [
|
343
|
+
{
|
344
|
+
date: "2020-01-01",
|
345
|
+
document: "1",
|
346
|
+
products: [
|
347
|
+
{ accomodation: "Standard" }
|
348
|
+
]
|
349
|
+
},
|
350
|
+
{
|
351
|
+
date: "2020-01-02",
|
352
|
+
document: "2",
|
353
|
+
products: [
|
354
|
+
{ accomodation: "Premium" }
|
355
|
+
]
|
356
|
+
}
|
357
|
+
]
|
358
|
+
}
|
359
|
+
```
|
360
|
+
|
361
|
+
In this example, if the `link` had not been used to select only the hotel of each booking, two accomodations would have been returned for each booking, and instead of extracting a string with the accomodation it would have extracted an array with all the accomodations for each booking.
|
362
|
+
|
363
|
+
You can combine the `link` with `array_of` if you want to search for a list of elements filtered by some field; just provide the `path` and the `link`:
|
364
|
+
|
365
|
+
```yml
|
366
|
+
schemas:
|
367
|
+
bookings:
|
368
|
+
array_of: booking
|
369
|
+
date: date
|
370
|
+
document: id
|
371
|
+
products:
|
372
|
+
array_of:
|
373
|
+
path: ../products[booking_id=<link>]
|
374
|
+
link: id
|
375
|
+
....
|
376
|
+
```
|
377
|
+
|
378
|
+
### uniq_by
|
379
|
+
|
380
|
+
Can only be used with **array_of**.
|
381
|
+
|
382
|
+
This functionality is useful when some XML nodes are duplicated and you want to extract data from the first occurrence only. It has a behavior similar to Ruby **uniq** method on arrays.
|
383
|
+
For each path generated from `array_of`, the value fetched using `uniq_by` will be checked against the generated collection and the path will be discarded if the value already exists.
|
384
|
+
|
385
|
+
```yml
|
386
|
+
schemas:
|
387
|
+
bookings:
|
388
|
+
array_of:
|
389
|
+
path: booking
|
390
|
+
uniq_by: id
|
391
|
+
date: bdate
|
392
|
+
document: id
|
393
|
+
```
|
394
|
+
```xml
|
395
|
+
<xml>
|
396
|
+
<booking>
|
397
|
+
<id>1</id>
|
398
|
+
<bdate>2020-01-01</bdate>
|
399
|
+
</booking>
|
400
|
+
<booking>
|
401
|
+
<id>1</id>
|
402
|
+
<bdate>2020-01-01</bdate>
|
403
|
+
</booking>
|
404
|
+
</xml>
|
405
|
+
```
|
406
|
+
```ruby
|
407
|
+
{
|
408
|
+
bookings: [
|
409
|
+
{
|
410
|
+
date: "2020-01-01",
|
411
|
+
document: "1"
|
412
|
+
}
|
413
|
+
]
|
414
|
+
}
|
415
|
+
```
|
416
|
+
|
417
|
+
In this example, if we didn't use the `uniq_by` option, two elements with the same data would be extracted, like:
|
418
|
+
|
419
|
+
```ruby
|
420
|
+
{
|
421
|
+
bookings: [
|
422
|
+
{
|
423
|
+
date: "2020-01-01",
|
424
|
+
document: "1"
|
425
|
+
},
|
426
|
+
{
|
427
|
+
date: "2020-01-01",
|
428
|
+
document: "1"
|
429
|
+
}
|
430
|
+
]
|
431
|
+
}
|
432
|
+
```
|
433
|
+
|
434
|
+
### array_presence: first_only
|
435
|
+
|
436
|
+
The field that contains this property will only be added to the first item of the array.
|
437
|
+
|
438
|
+
Can only be used in fields that belong to a node of `array_of`.
|
439
|
+
|
440
|
+
```yml
|
441
|
+
passengers:
|
442
|
+
array_of: bookings/booking/passengers/passenger
|
443
|
+
id:
|
444
|
+
path: document
|
445
|
+
modifier: to_s
|
446
|
+
name:
|
447
|
+
attr: [FirstName, LastName]
|
448
|
+
modifier:
|
449
|
+
- name: join
|
450
|
+
params: [" "]
|
451
|
+
tax_rav:
|
452
|
+
array_presence: first_only
|
453
|
+
path: ../rav
|
454
|
+
modifier: to_f
|
455
|
+
```
|
456
|
+
```xml
|
457
|
+
<bookings>
|
458
|
+
<booking>
|
459
|
+
<rav>150</rav>
|
460
|
+
<passengers>
|
461
|
+
<passenger>
|
462
|
+
<document>109.111.019-79</document>
|
463
|
+
<FirstName>Marcelo</FirstName>
|
464
|
+
<LastName>Lauxen</LastName>
|
465
|
+
</passenger>
|
466
|
+
<passenger>
|
467
|
+
<document>110.155.019-78</document>
|
468
|
+
<FirstName>Corona</FirstName>
|
469
|
+
<LastName>Virus</LastName>
|
470
|
+
</passenger>
|
471
|
+
</passengers>
|
472
|
+
</booking>
|
473
|
+
</bookings>
|
474
|
+
```
|
475
|
+
```ruby
|
476
|
+
{
|
477
|
+
bookings: [
|
478
|
+
{
|
479
|
+
passengers: [
|
480
|
+
{
|
481
|
+
id: "109.111.019-79",
|
482
|
+
name: "Marcelo Lauxen",
|
483
|
+
tax_rav: 150.00
|
484
|
+
},
|
485
|
+
{
|
486
|
+
id: "110.155.019-78",
|
487
|
+
name: "Corona Virus"
|
488
|
+
}
|
489
|
+
]
|
490
|
+
}
|
491
|
+
]
|
492
|
+
}
|
493
|
+
```
|
494
|
+
|
495
|
+
In this example the field `tax_rav` was only included on the first passenger because this field has the `array_presence: first_only` property.
|
496
|
+
|
497
|
+
### in_parent
|
498
|
+
|
499
|
+
This option allows you to navigate to a parent node of the current node.
|
500
|
+
|
501
|
+
```yml
|
502
|
+
passengers:
|
503
|
+
array_of: bookings/booking/passengers/passenger
|
504
|
+
id:
|
505
|
+
path: document
|
506
|
+
modifier: to_s
|
507
|
+
bookings_id:
|
508
|
+
in_parent: bookings
|
509
|
+
path: id
|
510
|
+
```
|
511
|
+
```xml
|
512
|
+
<bookings>
|
513
|
+
<bookings_id>8888</bookings_id>
|
514
|
+
<booking>
|
515
|
+
<passengers>
|
516
|
+
<passenger>
|
517
|
+
<document>109.111.019-79</document>
|
518
|
+
</passenger>
|
519
|
+
<passenger>
|
520
|
+
<document>110.155.019-78</document>
|
521
|
+
</passenger>
|
522
|
+
</passengers>
|
523
|
+
</booking>
|
524
|
+
</bookings>
|
525
|
+
```
|
526
|
+
```ruby
|
527
|
+
{
|
528
|
+
bookings: [
|
529
|
+
{
|
530
|
+
passengers: [
|
531
|
+
{
|
532
|
+
id: "109.111.019-79",
|
533
|
+
bookings_id: 8888
|
534
|
+
},
|
535
|
+
{
|
536
|
+
id: "110.155.019-78",
|
537
|
+
bookings_id: 8888
|
538
|
+
}
|
539
|
+
]
|
540
|
+
}
|
541
|
+
]
|
542
|
+
}
|
543
|
+
```
|
544
|
+
|
545
|
+
In this example the value of `bookings_id` will be extracted starting at the node provided in `in_parent` instead of the current node. It's possible to navigate to a parent node with `../` too (xpath provides this functionality), but with `in_parent` you just need to provide the name of the parent node: it will navigate up until the parent node is found, no matter how many levels.
|
546
|
+
|
547
|
+
### keep_if
|
548
|
+
|
549
|
+
This option allows you to keep a block of the hash in the final result only if the condition matches.
|
550
|
+
|
551
|
+
```yml
|
552
|
+
schemas:
|
553
|
+
dummy:
|
554
|
+
within: data
|
555
|
+
description: additional_desc
|
556
|
+
exchange: currency_info/value
|
557
|
+
price: price
|
558
|
+
payment:
|
559
|
+
type: payment_info/method
|
560
|
+
value: payment_info/price
|
561
|
+
keep_if: "'type' == 'invoice'"
|
562
|
+
```
|
563
|
+
```xml
|
564
|
+
<data>
|
565
|
+
<additional_desc>Keep walking</additional_desc>
|
566
|
+
<currency_info kind="USD">
|
567
|
+
<value>4.15</value>
|
568
|
+
</currency_info>
|
569
|
+
<price>55.09</price>
|
570
|
+
<payment_info>
|
571
|
+
<method>card</method>
|
572
|
+
<price>55.48</price>
|
573
|
+
<payment>
|
574
|
+
<installments>2</installments>
|
575
|
+
<card_number>333</card_number>
|
576
|
+
</payment>
|
577
|
+
</payment_info>
|
578
|
+
</data>
|
579
|
+
```
|
580
|
+
```ruby
|
581
|
+
{
|
582
|
+
dummy: {
|
583
|
+
description: "Keep walking",
|
584
|
+
exchange: "4.15",
|
585
|
+
price: "55.09"
|
586
|
+
}
|
587
|
+
}
|
588
|
+
```
|
589
|
+
|
590
|
+
In this example the condition didn't match since the payment method was `card` instead of `invoice` and then the extracted payment hash was removed from the final result.
|
591
|
+
|
303
592
|
### Formatting:
|
304
593
|
|
305
594
|
#### fixed
|
@@ -379,7 +668,7 @@ schemas:
|
|
379
668
|
path: [firstname, lastname]
|
380
669
|
modifier:
|
381
670
|
- name: join
|
382
|
-
params: [" "]
|
671
|
+
params: [" "]
|
383
672
|
- downcase
|
384
673
|
```
|
385
674
|
```xml
|
data/lib/src/extract/array_of.rb
CHANGED
data/lib/src/extract/base.rb
CHANGED
data/lib/src/extract/unescape.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative "base"
|
2
4
|
require_relative "array_value"
|
3
5
|
require_relative "array_of"
|
@@ -25,7 +27,7 @@ module Extract
|
|
25
27
|
|
26
28
|
def value_for_hash
|
27
29
|
props = node.props
|
28
|
-
|
30
|
+
|
29
31
|
Unescape.new(node, extractor).unescape! if props[:unescape]
|
30
32
|
|
31
33
|
fixed_value = props[:fixed]
|
data/lib/src/extract/within.rb
CHANGED
data/lib/src/extractor.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require "cgi"
|
2
4
|
require "active_support/core_ext/string"
|
3
5
|
require_relative "format/formatter"
|
@@ -37,7 +39,7 @@ class PathBuilder < Struct.new(:base, :parent, :tag, keyword_init: true)
|
|
37
39
|
end
|
38
40
|
|
39
41
|
def matching_tags?(item, tag)
|
40
|
-
item.gsub(/\[\d
|
42
|
+
item.gsub(/\[\d+\]/, "") == tag
|
41
43
|
end
|
42
44
|
end
|
43
45
|
|
data/lib/src/format/formatter.rb
CHANGED
data/lib/src/format/mapper.rb
CHANGED
data/lib/src/format/modifier.rb
CHANGED
data/lib/src/node.rb
CHANGED
data/xml_data_extractor.gemspec
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "xml_data_extractor"
|
3
|
-
spec.version = "0.
|
3
|
+
spec.version = "0.6.0"
|
4
4
|
spec.authors = ["Fernando Almeida"]
|
5
5
|
spec.email = ["fernandoprsbr@gmail.com"]
|
6
6
|
|
7
7
|
spec.summary = "Provides a simples DSL for extracting data from XML documents"
|
8
8
|
spec.homepage = "https://github.com/monde-sistemas/xml_data_extractor"
|
9
9
|
spec.license = "MIT"
|
10
|
-
spec.required_ruby_version = Gem::Requirement.new(">= 2.
|
10
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 2.5")
|
11
11
|
|
12
12
|
spec.metadata["homepage_uri"] = spec.homepage
|
13
13
|
spec.metadata["source_code_uri"] = spec.homepage
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xml_data_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Fernando Almeida
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-04-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -73,13 +73,13 @@ executables: []
|
|
73
73
|
extensions: []
|
74
74
|
extra_rdoc_files: []
|
75
75
|
files:
|
76
|
+
- ".editorconfig"
|
76
77
|
- ".github/dependabot.yml"
|
77
78
|
- ".github/workflows/ci.yml"
|
78
79
|
- ".gitignore"
|
79
80
|
- ".kodiak.toml"
|
80
81
|
- ".rspec"
|
81
82
|
- ".ruby-version"
|
82
|
-
- ".travis.yml"
|
83
83
|
- Gemfile
|
84
84
|
- Gemfile.lock
|
85
85
|
- LICENSE.txt
|
@@ -119,14 +119,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
119
119
|
requirements:
|
120
120
|
- - ">="
|
121
121
|
- !ruby/object:Gem::Version
|
122
|
-
version: 2.
|
122
|
+
version: '2.5'
|
123
123
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
124
124
|
requirements:
|
125
125
|
- - ">="
|
126
126
|
- !ruby/object:Gem::Version
|
127
127
|
version: '0'
|
128
128
|
requirements: []
|
129
|
-
rubygems_version: 3.2.
|
129
|
+
rubygems_version: 3.2.25
|
130
130
|
signing_key:
|
131
131
|
specification_version: 4
|
132
132
|
summary: Provides a simples DSL for extracting data from XML documents
|