xml_data_extractor 0.3.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.editorconfig +8 -0
- data/.github/dependabot.yml +25 -0
- data/.github/workflows/ci.yml +31 -0
- data/.gitignore +1 -0
- data/.kodiak.toml +14 -0
- data/.ruby-version +1 -0
- data/Gemfile +0 -3
- data/Gemfile.lock +30 -29
- data/README.md +326 -2
- data/bin/rspec +29 -0
- data/lib/src/extract/array_of.rb +2 -0
- data/lib/src/extract/array_value.rb +2 -0
- data/lib/src/extract/base.rb +2 -0
- data/lib/src/extract/expression.rb +4 -3
- data/lib/src/extract/hash_builder.rb +3 -1
- data/lib/src/extract/string_value.rb +2 -0
- data/lib/src/extract/unescape.rb +14 -0
- data/lib/src/extract/value_builder.rb +6 -0
- data/lib/src/extract/within.rb +2 -0
- data/lib/src/extractor.rb +19 -7
- data/lib/src/format/formatter.rb +2 -0
- data/lib/src/format/mapper.rb +2 -0
- data/lib/src/format/modifier.rb +2 -0
- data/lib/src/node.rb +2 -0
- data/xml_data_extractor.gemspec +3 -2
- metadata +25 -5
- data/.travis.yml +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 334896bd178759618062d648f74af638a88fde5c5cccfaf255279589207670a6
|
4
|
+
data.tar.gz: 6b85212f452f62bfa75a97c66f76c889cc39382d726d26b93a05800eb69e6dbe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6899c3dafed6462fcb816edbe341a33e7a7388b2f3ba2724d5c2e0bab190f7ce00256c8bda35fbeaed7eaeb58ccfad8c3597c94a1e332ef7e5c125efd6a50924
|
7
|
+
data.tar.gz: 1b0b37b90adba98c9b2085d6a300473dca39bcafd4c801c58a5877ae57ad518be8d6079d0c61725f73809779ea3ace7590e4258494e2d1e0a61b7e914f8e5f69
|
data/.editorconfig
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
version: 2
|
2
|
+
updates:
|
3
|
+
- package-ecosystem: bundler
|
4
|
+
directory: "/"
|
5
|
+
schedule:
|
6
|
+
interval: daily
|
7
|
+
time: "08:00"
|
8
|
+
timezone: America/Sao_Paulo
|
9
|
+
open-pull-requests-limit: 10
|
10
|
+
versioning-strategy: lockfile-only
|
11
|
+
labels:
|
12
|
+
- dependencies
|
13
|
+
- ruby
|
14
|
+
- automerge
|
15
|
+
- package-ecosystem: "github-actions"
|
16
|
+
directory: "/"
|
17
|
+
schedule:
|
18
|
+
interval: daily
|
19
|
+
time: "08:00"
|
20
|
+
timezone: America/Sao_Paulo
|
21
|
+
open-pull-requests-limit: 10
|
22
|
+
labels:
|
23
|
+
- dependencies
|
24
|
+
- github-actions
|
25
|
+
- automerge
|
@@ -0,0 +1,31 @@
|
|
1
|
+
name: ci
|
2
|
+
|
3
|
+
on: [push, pull_request]
|
4
|
+
|
5
|
+
jobs:
|
6
|
+
qa:
|
7
|
+
timeout-minutes: 5
|
8
|
+
runs-on: ubuntu-latest
|
9
|
+
steps:
|
10
|
+
- name: Checkout code
|
11
|
+
uses: actions/checkout@v3
|
12
|
+
|
13
|
+
- name: Freeze autogenerated files
|
14
|
+
run: |
|
15
|
+
chmod 0444 Gemfile.lock
|
16
|
+
|
17
|
+
- name: Setup Ruby
|
18
|
+
uses: ruby/setup-ruby@v1
|
19
|
+
|
20
|
+
- name: Cache Ruby Dependencies
|
21
|
+
uses: actions/cache@v3
|
22
|
+
with:
|
23
|
+
path: vendor/bundle
|
24
|
+
key: ${{ runner.os }}-gem-${{ hashFiles('.ruby-version') }}-${{ hashFiles('**/Gemfile.lock') }}
|
25
|
+
|
26
|
+
- name: Setup project
|
27
|
+
run: bin/setup
|
28
|
+
|
29
|
+
- name: Run tests
|
30
|
+
run: |
|
31
|
+
bin/rspec
|
data/.gitignore
CHANGED
data/.kodiak.toml
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# https://github.com/chdsbd/kodiak
|
2
|
+
version = 1
|
3
|
+
|
4
|
+
[approve]
|
5
|
+
auto_approve_usernames = ["dependabot"]
|
6
|
+
|
7
|
+
[merge]
|
8
|
+
method = "squash"
|
9
|
+
delete_branch_on_merge = true
|
10
|
+
|
11
|
+
[merge.message]
|
12
|
+
title = "pull_request_title"
|
13
|
+
include_pr_number = true
|
14
|
+
body = "pull_request_body"
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.7.2
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,45 +1,46 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
xml_data_extractor (0.
|
4
|
+
xml_data_extractor (0.6.0)
|
5
5
|
activesupport (~> 6.0)
|
6
6
|
nokogiri (~> 1.0)
|
7
7
|
|
8
8
|
GEM
|
9
9
|
remote: https://rubygems.org/
|
10
10
|
specs:
|
11
|
-
activesupport (6.
|
11
|
+
activesupport (6.1.5)
|
12
12
|
concurrent-ruby (~> 1.0, >= 1.0.2)
|
13
|
-
i18n (>=
|
14
|
-
minitest (
|
15
|
-
tzinfo (~>
|
16
|
-
zeitwerk (~> 2.
|
17
|
-
concurrent-ruby (1.1.
|
18
|
-
diff-lcs (1.
|
19
|
-
i18n (1.
|
13
|
+
i18n (>= 1.6, < 2)
|
14
|
+
minitest (>= 5.1)
|
15
|
+
tzinfo (~> 2.0)
|
16
|
+
zeitwerk (~> 2.3)
|
17
|
+
concurrent-ruby (1.1.10)
|
18
|
+
diff-lcs (1.5.0)
|
19
|
+
i18n (1.10.0)
|
20
20
|
concurrent-ruby (~> 1.0)
|
21
|
-
mini_portile2 (2.
|
22
|
-
minitest (5.
|
23
|
-
nokogiri (1.
|
24
|
-
mini_portile2 (~> 2.
|
21
|
+
mini_portile2 (2.8.0)
|
22
|
+
minitest (5.15.0)
|
23
|
+
nokogiri (1.13.3)
|
24
|
+
mini_portile2 (~> 2.8.0)
|
25
|
+
racc (~> 1.4)
|
26
|
+
racc (1.6.0)
|
25
27
|
rake (12.3.3)
|
26
|
-
rspec (3.
|
27
|
-
rspec-core (~> 3.
|
28
|
-
rspec-expectations (~> 3.
|
29
|
-
rspec-mocks (~> 3.
|
30
|
-
rspec-core (3.
|
31
|
-
rspec-support (~> 3.
|
32
|
-
rspec-expectations (3.
|
28
|
+
rspec (3.11.0)
|
29
|
+
rspec-core (~> 3.11.0)
|
30
|
+
rspec-expectations (~> 3.11.0)
|
31
|
+
rspec-mocks (~> 3.11.0)
|
32
|
+
rspec-core (3.11.0)
|
33
|
+
rspec-support (~> 3.11.0)
|
34
|
+
rspec-expectations (3.11.0)
|
33
35
|
diff-lcs (>= 1.2.0, < 2.0)
|
34
|
-
rspec-support (~> 3.
|
35
|
-
rspec-mocks (3.
|
36
|
+
rspec-support (~> 3.11.0)
|
37
|
+
rspec-mocks (3.11.0)
|
36
38
|
diff-lcs (>= 1.2.0, < 2.0)
|
37
|
-
rspec-support (~> 3.
|
38
|
-
rspec-support (3.
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
zeitwerk (2.4.1)
|
39
|
+
rspec-support (~> 3.11.0)
|
40
|
+
rspec-support (3.11.0)
|
41
|
+
tzinfo (2.0.4)
|
42
|
+
concurrent-ruby (~> 1.0)
|
43
|
+
zeitwerk (2.5.4)
|
43
44
|
|
44
45
|
PLATFORMS
|
45
46
|
ruby
|
@@ -50,4 +51,4 @@ DEPENDENCIES
|
|
50
51
|
xml_data_extractor!
|
51
52
|
|
52
53
|
BUNDLED WITH
|
53
|
-
2.
|
54
|
+
2.2.6
|
data/README.md
CHANGED
@@ -171,7 +171,6 @@ schemas:
|
|
171
171
|
within: info/movie_data
|
172
172
|
title: original_title
|
173
173
|
actor: main_actor
|
174
|
-
|
175
174
|
```
|
176
175
|
```xml
|
177
176
|
<xml>
|
@@ -187,6 +186,42 @@ schemas:
|
|
187
186
|
{ movie: { title: "The Irishman", actor: "Robert De Niro" } }
|
188
187
|
```
|
189
188
|
|
189
|
+
#### unescape
|
190
|
+
|
191
|
+
This option is useful when you have XML or HTML embedded as escaped text inside a tag (for example, in CDATA elements) and you need to unescape it first in order to parse its content:
|
192
|
+
|
193
|
+
```yml
|
194
|
+
schemas:
|
195
|
+
movie:
|
196
|
+
unescape: response
|
197
|
+
title: response/original_title
|
198
|
+
actor: response/main_actor
|
199
|
+
|
200
|
+
```
|
201
|
+
|
202
|
+
```xml
|
203
|
+
<xml>
|
204
|
+
<response>
|
205
|
+
&lt;original_title&gt;The Irishman&lt;/original_title&gt;&lt;main_actor&gt;Robert De Niro&lt;/main_actor&gt;
|
206
|
+
</response>
|
207
|
+
</xml>
|
208
|
+
```
|
209
|
+
|
210
|
+
This XML will be turned into this one during the parsing:
|
211
|
+
|
212
|
+
```xml
|
213
|
+
<xml>
|
214
|
+
<response>
|
215
|
+
<original_title>The Irishman</original_title>
|
216
|
+
<main_actor>Robert De Niro</main_actor>
|
217
|
+
</response>
|
218
|
+
</xml>
|
219
|
+
```
|
220
|
+
|
221
|
+
```ruby
|
222
|
+
{ movie: { title: "The Irishman", actor: "Robert De Niro" } }
|
223
|
+
```
|
224
|
+
|
190
225
|
#### array_of
|
191
226
|
|
192
227
|
Defines the path to a XML collection, which will be looped generating an array of hashes:
|
@@ -265,6 +300,295 @@ schemas:
|
|
265
300
|
}
|
266
301
|
```
|
267
302
|
|
303
|
+
### link
|
304
|
+
|
305
|
+
This command is useful when the XML contains references to other nodes; it works like a SQL JOIN. The path must be an expression containing the `<link>` identifier, which will be replaced by the value fetched from the `link:` command.
|
306
|
+
|
307
|
+
Example:
|
308
|
+
```yml
|
309
|
+
schemas:
|
310
|
+
bookings:
|
311
|
+
array_of: booking
|
312
|
+
date: booking_date
|
313
|
+
document: id
|
314
|
+
products:
|
315
|
+
array_of:
|
316
|
+
accomodation:
|
317
|
+
path: ../hotel[booking_id=<link>]/accomodation
|
318
|
+
link: id
|
319
|
+
```
|
320
|
+
```xml
|
321
|
+
<xml>
|
322
|
+
<booking>
|
323
|
+
<id>1</id>
|
324
|
+
<booking_date>2020-01-01</booking_date>
|
325
|
+
</booking>
|
326
|
+
<booking>
|
327
|
+
<id>2</id>
|
328
|
+
<booking_date>2020-01-02</booking_date>
|
329
|
+
</booking>
|
330
|
+
<hotel>
|
331
|
+
<booking_id>1</booking_id>
|
332
|
+
<accomodation>Standard</accomodation>
|
333
|
+
</hotel>
|
334
|
+
<hotel>
|
335
|
+
<booking_id>2</booking_id>
|
336
|
+
<accomodation>Premium</accomodation>
|
337
|
+
</hotel>
|
338
|
+
</xml>
|
339
|
+
```
|
340
|
+
```ruby
|
341
|
+
{
|
342
|
+
bookings: [
|
343
|
+
{
|
344
|
+
date: "2020-01-01",
|
345
|
+
document: "1",
|
346
|
+
products: [
|
347
|
+
{ accomodation: "Standard" }
|
348
|
+
]
|
349
|
+
},
|
350
|
+
{
|
351
|
+
date: "2020-01-02",
|
352
|
+
document: "2",
|
353
|
+
products: [
|
354
|
+
{ accomodation: "Premium" }
|
355
|
+
]
|
356
|
+
}
|
357
|
+
]
|
358
|
+
}
|
359
|
+
```
|
360
|
+
|
361
|
+
In this example, if the `link` had not been used to get only the hotel of each booking, two accommodations would have been returned for each booking, and instead of extracting a string with the accommodation it would extract an array with all the accommodations for each booking.
|
362
|
+
|
363
|
+
You can combine the `link` with `array_of` if you want to search for a list of elements filtered by some field; just provide the `path` and the `link`:
|
364
|
+
|
365
|
+
```yml
|
366
|
+
schemas:
|
367
|
+
bookings:
|
368
|
+
array_of: booking
|
369
|
+
date: date
|
370
|
+
document: id
|
371
|
+
products:
|
372
|
+
array_of:
|
373
|
+
path: ../products[booking_id=<link>]
|
374
|
+
link: id
|
375
|
+
....
|
376
|
+
```
|
377
|
+
|
378
|
+
### uniq_by
|
379
|
+
|
380
|
+
Can only be used with **array_of**.
|
381
|
+
|
382
|
+
This functionality is useful when some XML nodes are duplicated and you want to extract data from the first occurrence only. It behaves similarly to Ruby's **uniq** method on arrays.
|
383
|
+
For each path generated from `array_of`, the value fetched using `uniq_by` will be checked against the generated collection and the path will be discarded if the value already exists.
|
384
|
+
|
385
|
+
```yml
|
386
|
+
schemas:
|
387
|
+
bookings:
|
388
|
+
array_of:
|
389
|
+
path: booking
|
390
|
+
uniq_by: id
|
391
|
+
date: bdate
|
392
|
+
document: id
|
393
|
+
```
|
394
|
+
```xml
|
395
|
+
<xml>
|
396
|
+
<booking>
|
397
|
+
<id>1</id>
|
398
|
+
<bdate>2020-01-01</bdate>
|
399
|
+
</booking>
|
400
|
+
<booking>
|
401
|
+
<id>1</id>
|
402
|
+
<bdate>2020-01-01</bdate>
|
403
|
+
</booking>
|
404
|
+
</xml>
|
405
|
+
```
|
406
|
+
```ruby
|
407
|
+
{
|
408
|
+
bookings: [
|
409
|
+
{
|
410
|
+
date: "2020-01-01",
|
411
|
+
document: "1"
|
412
|
+
}
|
413
|
+
]
|
414
|
+
}
|
415
|
+
```
|
416
|
+
|
417
|
+
In this example, if we didn't use the `uniq_by` option, two elements with the same data would be extracted, like:
|
418
|
+
|
419
|
+
```ruby
|
420
|
+
{
|
421
|
+
bookings: [
|
422
|
+
{
|
423
|
+
date: "2020-01-01",
|
424
|
+
document: "1"
|
425
|
+
},
|
426
|
+
{
|
427
|
+
date: "2020-01-01",
|
428
|
+
document: "1"
|
429
|
+
}
|
430
|
+
]
|
431
|
+
}
|
432
|
+
```
|
433
|
+
|
434
|
+
### array_presence: first_only
|
435
|
+
|
436
|
+
The field that contains this property will only be added to the first item of the array.
|
437
|
+
|
438
|
+
Can only be used in fields that belong to a node of `array_of`.
|
439
|
+
|
440
|
+
```yml
|
441
|
+
passengers:
|
442
|
+
array_of: bookings/booking/passengers/passenger
|
443
|
+
id:
|
444
|
+
path: document
|
445
|
+
modifier: to_s
|
446
|
+
name:
|
447
|
+
attr: [FirstName, LastName]
|
448
|
+
modifier:
|
449
|
+
- name: join
|
450
|
+
params: [" "]
|
451
|
+
tax_rav:
|
452
|
+
array_presence: first_only
|
453
|
+
path: ../rav
|
454
|
+
modifier: to_f
|
455
|
+
```
|
456
|
+
```xml
|
457
|
+
<bookings>
|
458
|
+
<booking>
|
459
|
+
<rav>150</rav>
|
460
|
+
<passengers>
|
461
|
+
<passenger>
|
462
|
+
<document>109.111.019-79</document>
|
463
|
+
<FirstName>Marcelo</FirstName>
|
464
|
+
<LastName>Lauxen</LastName>
|
465
|
+
</passenger>
|
466
|
+
<passenger>
|
467
|
+
<document>110.155.019-78</document>
|
468
|
+
<FirstName>Corona</FirstName>
|
469
|
+
<LastName>Virus</LastName>
|
470
|
+
</passenger>
|
471
|
+
</passengers>
|
472
|
+
</booking>
|
473
|
+
</bookings>
|
474
|
+
```
|
475
|
+
```ruby
|
476
|
+
{
|
477
|
+
bookings: [
|
478
|
+
{
|
479
|
+
passengers: [
|
480
|
+
{
|
481
|
+
id: "109.111.019-79",
|
482
|
+
name: "Marcelo Lauxen",
|
483
|
+
tax_rav: 150.00
|
484
|
+
},
|
485
|
+
{
|
486
|
+
id: "110.155.019-78",
|
487
|
+
name: "Corona Virus"
|
488
|
+
}
|
489
|
+
]
|
490
|
+
}
|
491
|
+
]
|
492
|
+
}
|
493
|
+
```
|
494
|
+
|
495
|
+
In this example the field `tax_rav` was only included on the first passenger because this field has the `array_presence: first_only` property.
|
496
|
+
|
497
|
+
### in_parent
|
498
|
+
|
499
|
+
This option allows you to navigate to a parent node of the current node.
|
500
|
+
|
501
|
+
```yml
|
502
|
+
passengers:
|
503
|
+
array_of: bookings/booking/passengers/passenger
|
504
|
+
id:
|
505
|
+
path: document
|
506
|
+
modifier: to_s
|
507
|
+
bookings_id:
|
508
|
+
in_parent: bookings
|
509
|
+
path: id
|
510
|
+
```
|
511
|
+
```xml
|
512
|
+
<bookings>
|
513
|
+
<id>8888</id>
|
514
|
+
<booking>
|
515
|
+
<passengers>
|
516
|
+
<passenger>
|
517
|
+
<document>109.111.019-79</document>
|
518
|
+
</passenger>
|
519
|
+
<passenger>
|
520
|
+
<document>110.155.019-78</document>
|
521
|
+
</passenger>
|
522
|
+
</passengers>
|
523
|
+
</booking>
|
524
|
+
</bookings>
|
525
|
+
```
|
526
|
+
```ruby
|
527
|
+
{
|
528
|
+
bookings: [
|
529
|
+
{
|
530
|
+
passengers: [
|
531
|
+
{
|
532
|
+
id: "109.111.019-79",
|
533
|
+
bookings_id: 8888
|
534
|
+
},
|
535
|
+
{
|
536
|
+
id: "110.155.019-78",
|
537
|
+
bookings_id: 8888
|
538
|
+
}
|
539
|
+
]
|
540
|
+
}
|
541
|
+
]
|
542
|
+
}
|
543
|
+
```
|
544
|
+
|
545
|
+
In this example the value of `bookings_id` will be extracted starting at the node provided in `in_parent` instead of the current node. It's also possible to navigate to a parent node with `../` (XPath provides this functionality), but with `in_parent` you only need to provide the name of the parent node: it will navigate up until that node is found, no matter how many levels.
|
546
|
+
|
547
|
+
### keep_if
|
548
|
+
|
549
|
+
This option allows you to keep a block of the hash in the final result only if the condition matches.
|
550
|
+
|
551
|
+
```yml
|
552
|
+
schemas:
|
553
|
+
dummy:
|
554
|
+
within: data
|
555
|
+
description: additional_desc
|
556
|
+
exchange: currency_info/value
|
557
|
+
price: price
|
558
|
+
payment:
|
559
|
+
type: payment_info/method
|
560
|
+
value: payment_info/price
|
561
|
+
keep_if: "'type' == 'invoice'"
|
562
|
+
```
|
563
|
+
```xml
|
564
|
+
<data>
|
565
|
+
<additional_desc>Keep walking</additional_desc>
|
566
|
+
<currency_info kind="USD">
|
567
|
+
<value>4.15</value>
|
568
|
+
</currency_info>
|
569
|
+
<price>55.09</price>
|
570
|
+
<payment_info>
|
571
|
+
<method>card</method>
|
572
|
+
<price>55.48</price>
|
573
|
+
<payment>
|
574
|
+
<installments>2</installments>
|
575
|
+
<card_number>333</card_number>
|
576
|
+
</payment>
|
577
|
+
</payment_info>
|
578
|
+
</data>
|
579
|
+
```
|
580
|
+
```ruby
|
581
|
+
{
|
582
|
+
dummy: {
|
583
|
+
description: "Keep walking",
|
584
|
+
exchange: "4.15",
|
585
|
+
price: "55.09"
|
586
|
+
}
|
587
|
+
}
|
588
|
+
```
|
589
|
+
|
590
|
+
In this example the condition didn't match, since the payment method was `card` instead of `invoice`, so the extracted payment hash was removed from the final result.
|
591
|
+
|
268
592
|
### Formatting:
|
269
593
|
|
270
594
|
#### fixed
|
@@ -344,7 +668,7 @@ schemas:
|
|
344
668
|
path: [firstname, lastname]
|
345
669
|
modifier:
|
346
670
|
- name: join
|
347
|
-
params: [" "]
|
671
|
+
params: [" "]
|
348
672
|
- downcase
|
349
673
|
```
|
350
674
|
```xml
|
data/bin/rspec
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
#
|
5
|
+
# This file was generated by Bundler.
|
6
|
+
#
|
7
|
+
# The application 'rspec' is installed as part of a gem, and
|
8
|
+
# this file is here to facilitate running it.
|
9
|
+
#
|
10
|
+
|
11
|
+
require "pathname"
|
12
|
+
ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../../Gemfile",
|
13
|
+
Pathname.new(__FILE__).realpath)
|
14
|
+
|
15
|
+
bundle_binstub = File.expand_path("../bundle", __FILE__)
|
16
|
+
|
17
|
+
if File.file?(bundle_binstub)
|
18
|
+
if File.read(bundle_binstub, 300) =~ /This file was generated by Bundler/
|
19
|
+
load(bundle_binstub)
|
20
|
+
else
|
21
|
+
abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
|
22
|
+
Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
require "rubygems"
|
27
|
+
require "bundler/setup"
|
28
|
+
|
29
|
+
load Gem.bin_path("rspec-core", "rspec")
|
data/lib/src/extract/array_of.rb
CHANGED
data/lib/src/extract/base.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Extract
|
2
4
|
class Expression
|
3
5
|
def initialize(expression, hash)
|
@@ -6,9 +8,8 @@ module Extract
|
|
6
8
|
end
|
7
9
|
|
8
10
|
def evaluate
|
9
|
-
|
10
|
-
|
11
|
-
condition = expression.gsub(field_name, field_value.to_s)
|
11
|
+
keys = Regexp.union(hash.keys.map(&:to_s))
|
12
|
+
condition = expression.gsub(keys, hash.stringify_keys)
|
12
13
|
|
13
14
|
eval(condition)
|
14
15
|
end
|
@@ -1,6 +1,8 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Extract
|
2
4
|
class HashBuilder < Base
|
3
|
-
INTERNAL_FIELDS = %i[array_of keep_if within].freeze
|
5
|
+
INTERNAL_FIELDS = %i[array_of keep_if within unescape].freeze
|
4
6
|
|
5
7
|
def value(index = 0)
|
6
8
|
path, props = node.to_h.values_at(:path, :props)
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Extract
|
4
|
+
class Unescape < Base
|
5
|
+
def unescape!
|
6
|
+
unescape_tag = node.props[:unescape]
|
7
|
+
|
8
|
+
paths_to_unescape = extractor.paths_of(node.path, unescape_tag)
|
9
|
+
return if paths_to_unescape.empty?
|
10
|
+
|
11
|
+
paths_to_unescape.each { |path| extractor.unescape!(path) }
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require_relative "base"
|
2
4
|
require_relative "array_value"
|
3
5
|
require_relative "array_of"
|
@@ -6,6 +8,7 @@ require_relative "string_value"
|
|
6
8
|
require_relative "value_builder"
|
7
9
|
require_relative "within"
|
8
10
|
require_relative "expression"
|
11
|
+
require_relative "unescape"
|
9
12
|
|
10
13
|
module Extract
|
11
14
|
class ValueBuilder < Base
|
@@ -24,6 +27,9 @@ module Extract
|
|
24
27
|
|
25
28
|
def value_for_hash
|
26
29
|
props = node.props
|
30
|
+
|
31
|
+
Unescape.new(node, extractor).unescape! if props[:unescape]
|
32
|
+
|
27
33
|
fixed_value = props[:fixed]
|
28
34
|
return fixed_value if fixed_value
|
29
35
|
return ArrayOf.new(node, extractor).value if props[:array_of]
|
data/lib/src/extract/within.rb
CHANGED
data/lib/src/extractor.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require "cgi"
|
2
4
|
require "active_support/core_ext/string"
|
3
5
|
require_relative "format/formatter"
|
@@ -37,11 +39,11 @@ class PathBuilder < Struct.new(:base, :parent, :tag, keyword_init: true)
|
|
37
39
|
end
|
38
40
|
|
39
41
|
def matching_tags?(item, tag)
|
40
|
-
item.gsub(/\[\d
|
42
|
+
item.gsub(/\[\d+\]/, "") == tag
|
41
43
|
end
|
42
44
|
end
|
43
45
|
|
44
|
-
class NodeParamsExtractor < Struct.new(:node)
|
46
|
+
class NodeParamsExtractor < Struct.new(:node)
|
45
47
|
def extract
|
46
48
|
[node.path, *node.props.values_at(:in_parent, :path, :link, :attr)]
|
47
49
|
end
|
@@ -49,7 +51,7 @@ end
|
|
49
51
|
|
50
52
|
class NodeExtractor
|
51
53
|
def initialize(xml)
|
52
|
-
@xml = Nokogiri::XML(
|
54
|
+
@xml = Nokogiri::XML(xml)
|
53
55
|
@xml.remove_namespaces!
|
54
56
|
end
|
55
57
|
|
@@ -59,12 +61,18 @@ class NodeExtractor
|
|
59
61
|
nil
|
60
62
|
end
|
61
63
|
|
62
|
-
|
64
|
+
def unescape!(path)
|
65
|
+
node = extract(path)
|
66
|
+
return if node.blank?
|
63
67
|
|
64
|
-
|
65
|
-
|
68
|
+
first_node = node.first
|
69
|
+
return if first_node.elements.present?
|
70
|
+
|
71
|
+
first_node.children = Nokogiri::XML.fragment(first_node.content).children
|
66
72
|
end
|
67
73
|
|
74
|
+
private
|
75
|
+
|
68
76
|
attr_reader :xml
|
69
77
|
end
|
70
78
|
|
@@ -193,7 +201,11 @@ class Extractor
|
|
193
201
|
end
|
194
202
|
|
195
203
|
value = path_value(path, tag, attribute)
|
196
|
-
format_value(value, node.props)
|
204
|
+
format_value(value, node.props)
|
205
|
+
end
|
206
|
+
|
207
|
+
def unescape!(path)
|
208
|
+
node_extractor.unescape!(path)
|
197
209
|
end
|
198
210
|
|
199
211
|
def format_value(value, props)
|
data/lib/src/format/formatter.rb
CHANGED
data/lib/src/format/mapper.rb
CHANGED
data/lib/src/format/modifier.rb
CHANGED
data/lib/src/node.rb
CHANGED
data/xml_data_extractor.gemspec
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "xml_data_extractor"
|
3
|
-
spec.version = "0.
|
3
|
+
spec.version = "0.6.0"
|
4
4
|
spec.authors = ["Fernando Almeida"]
|
5
5
|
spec.email = ["fernandoprsbr@gmail.com"]
|
6
6
|
|
7
7
|
spec.summary = "Provides a simples DSL for extracting data from XML documents"
|
8
8
|
spec.homepage = "https://github.com/monde-sistemas/xml_data_extractor"
|
9
9
|
spec.license = "MIT"
|
10
|
-
spec.required_ruby_version = Gem::Requirement.new(">= 2.
|
10
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 2.5")
|
11
11
|
|
12
12
|
spec.metadata["homepage_uri"] = spec.homepage
|
13
13
|
spec.metadata["source_code_uri"] = spec.homepage
|
@@ -24,5 +24,6 @@ Gem::Specification.new do |spec|
|
|
24
24
|
|
25
25
|
spec.add_dependency "nokogiri", "~> 1.0"
|
26
26
|
spec.add_dependency "activesupport", "~> 6.0"
|
27
|
+
spec.add_development_dependency "rake", "~> 12.0"
|
27
28
|
spec.add_development_dependency "rspec", "~> 3.0"
|
28
29
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xml_data_extractor
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Fernando Almeida
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-04-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '6.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '12.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '12.0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: rspec
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -59,15 +73,20 @@ executables: []
|
|
59
73
|
extensions: []
|
60
74
|
extra_rdoc_files: []
|
61
75
|
files:
|
76
|
+
- ".editorconfig"
|
77
|
+
- ".github/dependabot.yml"
|
78
|
+
- ".github/workflows/ci.yml"
|
62
79
|
- ".gitignore"
|
80
|
+
- ".kodiak.toml"
|
63
81
|
- ".rspec"
|
64
|
-
- ".
|
82
|
+
- ".ruby-version"
|
65
83
|
- Gemfile
|
66
84
|
- Gemfile.lock
|
67
85
|
- LICENSE.txt
|
68
86
|
- README.md
|
69
87
|
- Rakefile
|
70
88
|
- bin/console
|
89
|
+
- bin/rspec
|
71
90
|
- bin/setup
|
72
91
|
- lib/src/extract/array_of.rb
|
73
92
|
- lib/src/extract/array_value.rb
|
@@ -75,6 +94,7 @@ files:
|
|
75
94
|
- lib/src/extract/expression.rb
|
76
95
|
- lib/src/extract/hash_builder.rb
|
77
96
|
- lib/src/extract/string_value.rb
|
97
|
+
- lib/src/extract/unescape.rb
|
78
98
|
- lib/src/extract/value_builder.rb
|
79
99
|
- lib/src/extract/within.rb
|
80
100
|
- lib/src/extractor.rb
|
@@ -99,14 +119,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
99
119
|
requirements:
|
100
120
|
- - ">="
|
101
121
|
- !ruby/object:Gem::Version
|
102
|
-
version: 2.
|
122
|
+
version: '2.5'
|
103
123
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
104
124
|
requirements:
|
105
125
|
- - ">="
|
106
126
|
- !ruby/object:Gem::Version
|
107
127
|
version: '0'
|
108
128
|
requirements: []
|
109
|
-
rubygems_version: 3.
|
129
|
+
rubygems_version: 3.2.25
|
110
130
|
signing_key:
|
111
131
|
specification_version: 4
|
112
132
|
summary: Provides a simples DSL for extracting data from XML documents
|