embulk 0.8.14-java → 0.8.15-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/build.gradle +3 -3
- data/embulk-core/src/main/java/org/embulk/EmbulkEmbed.java +1 -1
- data/embulk-core/src/main/java/org/embulk/exec/BulkLoader.java +35 -7
- data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +2 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecutionResult.java +8 -1
- data/embulk-core/src/main/java/org/embulk/exec/SkipTransactionException.java +23 -0
- data/embulk-docs/src/built-in.rst +299 -62
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.8.15.rst +17 -0
- data/embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java +374 -1
- data/embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java +872 -0
- data/embulk-test/build.gradle +6 -0
- data/embulk-test/src/main/java/org/embulk/test/EmbulkTests.java +75 -0
- data/embulk-test/src/main/java/org/embulk/test/TestingBulkLoader.java +124 -0
- data/embulk-test/src/main/java/org/embulk/test/TestingEmbulk.java +223 -0
- data/lib/embulk/version.rb +1 -1
- data/settings.gradle +1 -0
- metadata +11 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c47e67c0b46d1a8dcd84337dee0d3479fd37f45f
|
4
|
+
data.tar.gz: edcd6bafc9095768db0798dcae9c0d53532f5054
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1946063a900859720ba33919583f1bcfabb543af948b02260f3f91664445c2754aa2014e5b9c3c41cddd800ecb238b2450c39a0520d825d4f03d9d693f7d04f3
|
7
|
+
data.tar.gz: 8348fd5001511a3b718668eccb8b2686fd6cc2cf9635b38f2c941f79b20aa185c83bc01bb535559782895ead8a52b5cdb823c341a177afed7af1dff568e57001
|
data/README.md
CHANGED
data/build.gradle
CHANGED
@@ -11,12 +11,12 @@ apply plugin: "com.github.jruby-gradle.jar"
|
|
11
11
|
apply plugin: 'com.jfrog.bintray'
|
12
12
|
apply plugin: 'com.github.johnrengelman.shadow'
|
13
13
|
|
14
|
-
def java_projects = [project(":embulk-core"), project(":embulk-standards"), project(":embulk-cli")]
|
15
|
-
def release_projects = [project(":embulk-core"), project(":embulk-standards")]
|
14
|
+
def java_projects = [project(":embulk-core"), project(":embulk-standards"), project(":embulk-cli"), project(":embulk-test")]
|
15
|
+
def release_projects = [project(":embulk-core"), project(":embulk-standards"), project(":embulk-test")]
|
16
16
|
|
17
17
|
allprojects {
|
18
18
|
group = 'org.embulk'
|
19
|
-
version = '0.8.
|
19
|
+
version = '0.8.15'
|
20
20
|
|
21
21
|
ext {
|
22
22
|
jrubyVersion = '9.1.5.0'
|
@@ -126,7 +126,7 @@ public class EmbulkEmbed
|
|
126
126
|
{
|
127
127
|
this.injector = injector;
|
128
128
|
injector.getInstance(org.slf4j.ILoggerFactory.class);
|
129
|
-
this.bulkLoader =
|
129
|
+
this.bulkLoader = injector.getInstance(BulkLoader.class);
|
130
130
|
this.guessExecutor = injector.getInstance(GuessExecutor.class);
|
131
131
|
this.previewExecutor = injector.getInstance(PreviewExecutor.class);
|
132
132
|
}
|
@@ -62,7 +62,7 @@ public class BulkLoader
|
|
62
62
|
this.injector = injector;
|
63
63
|
}
|
64
64
|
|
65
|
-
|
65
|
+
protected static class LoaderState
|
66
66
|
implements ProcessState
|
67
67
|
{
|
68
68
|
private final Logger logger;
|
@@ -343,7 +343,17 @@ public class BulkLoader
|
|
343
343
|
ignoredExceptions.add(ex);
|
344
344
|
}
|
345
345
|
|
346
|
-
return new ExecutionResult(configDiff, ignoredExceptions.build());
|
346
|
+
return new ExecutionResult(configDiff, false, ignoredExceptions.build());
|
347
|
+
}
|
348
|
+
|
349
|
+
public ExecutionResult buildExecuteResultOfSkippedExecution(ConfigDiff configDiff)
|
350
|
+
{
|
351
|
+
ImmutableList.Builder<Throwable> ignoredExceptions = ImmutableList.builder();
|
352
|
+
for (Throwable e : getExceptions()) {
|
353
|
+
ignoredExceptions.add(e);
|
354
|
+
}
|
355
|
+
|
356
|
+
return new ExecutionResult(configDiff, true, ignoredExceptions.build());
|
347
357
|
}
|
348
358
|
|
349
359
|
public ResumeState buildResumeState(ExecSession exec)
|
@@ -364,6 +374,11 @@ public class BulkLoader
|
|
364
374
|
}
|
365
375
|
}
|
366
376
|
|
377
|
+
protected LoaderState newLoaderState(Logger logger, ProcessPluginSet plugins)
|
378
|
+
{
|
379
|
+
return new LoaderState(logger, plugins);
|
380
|
+
}
|
381
|
+
|
367
382
|
public ExecutionResult run(ExecSession exec, final ConfigSource config)
|
368
383
|
{
|
369
384
|
try {
|
@@ -418,7 +433,7 @@ public class BulkLoader
|
|
418
433
|
}
|
419
434
|
}
|
420
435
|
|
421
|
-
|
436
|
+
protected static class ProcessPluginSet
|
422
437
|
{
|
423
438
|
private final PluginType inputPluginType;
|
424
439
|
private final PluginType outputPluginType;
|
@@ -507,7 +522,7 @@ public class BulkLoader
|
|
507
522
|
final ExecutorPlugin exec = newExecutorPlugin(task);
|
508
523
|
final ProcessPluginSet plugins = new ProcessPluginSet(task);
|
509
524
|
|
510
|
-
final LoaderState state =
|
525
|
+
final LoaderState state = newLoaderState(Exec.getLogger(BulkLoader.class), plugins);
|
511
526
|
state.setTransactionStage(TransactionStage.INPUT_BEGIN);
|
512
527
|
try {
|
513
528
|
ConfigDiff inputConfigDiff = plugins.getInputPlugin().transaction(task.getInputConfig(), new InputPlugin.Control() {
|
@@ -565,7 +580,11 @@ public class BulkLoader
|
|
565
580
|
return state.buildExecuteResult();
|
566
581
|
|
567
582
|
} catch (Throwable ex) {
|
568
|
-
if (
|
583
|
+
if (isSkippedTransaction(ex)) {
|
584
|
+
ConfigDiff configDiff = ((SkipTransactionException) ex).getConfigDiff();
|
585
|
+
return state.buildExecuteResultOfSkippedExecution(configDiff);
|
586
|
+
}
|
587
|
+
else if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
|
569
588
|
// ignore the exception
|
570
589
|
return state.buildExecuteResultWithWarningException(ex);
|
571
590
|
}
|
@@ -580,7 +599,7 @@ public class BulkLoader
|
|
580
599
|
final ExecutorPlugin exec = newExecutorPlugin(task);
|
581
600
|
final ProcessPluginSet plugins = new ProcessPluginSet(task);
|
582
601
|
|
583
|
-
final LoaderState state =
|
602
|
+
final LoaderState state = newLoaderState(Exec.getLogger(BulkLoader.class), plugins);
|
584
603
|
state.setTransactionStage(TransactionStage.INPUT_BEGIN);
|
585
604
|
try {
|
586
605
|
ConfigDiff inputConfigDiff = plugins.getInputPlugin().resume(resume.getInputTaskSource(), resume.getInputSchema(), resume.getInputTaskReports().size(), new InputPlugin.Control() {
|
@@ -642,7 +661,11 @@ public class BulkLoader
|
|
642
661
|
return state.buildExecuteResult();
|
643
662
|
|
644
663
|
} catch (Throwable ex) {
|
645
|
-
if (
|
664
|
+
if (isSkippedTransaction(ex)) {
|
665
|
+
ConfigDiff configDiff = ((SkipTransactionException) ex).getConfigDiff();
|
666
|
+
return state.buildExecuteResultOfSkippedExecution(configDiff);
|
667
|
+
}
|
668
|
+
else if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
|
646
669
|
// ignore the exception
|
647
670
|
return state.buildExecuteResultWithWarningException(ex);
|
648
671
|
}
|
@@ -650,6 +673,11 @@ public class BulkLoader
|
|
650
673
|
}
|
651
674
|
}
|
652
675
|
|
676
|
+
private static boolean isSkippedTransaction(Throwable ex)
|
677
|
+
{
|
678
|
+
return ex instanceof SkipTransactionException;
|
679
|
+
}
|
680
|
+
|
653
681
|
private static void restoreResumedTaskReports(ResumeState resume, LoaderState state)
|
654
682
|
{
|
655
683
|
int inputTaskCount = resume.getInputTaskReports().size();
|
@@ -26,6 +26,8 @@ public class ExecModule
|
|
26
26
|
{
|
27
27
|
Preconditions.checkNotNull(binder, "binder is null.");
|
28
28
|
|
29
|
+
binder.bind(BulkLoader.class);
|
30
|
+
|
29
31
|
binder.bind(ILoggerFactory.class).toProvider(LoggerProvider.class).in(Scopes.SINGLETON);
|
30
32
|
binder.bind(ModelManager.class).in(Scopes.SINGLETON);
|
31
33
|
binder.bind(BufferAllocator.class).to(PooledBufferAllocator.class).in(Scopes.SINGLETON);
|
@@ -6,11 +6,13 @@ import org.embulk.config.ConfigDiff;
|
|
6
6
|
public class ExecutionResult
|
7
7
|
{
|
8
8
|
private final ConfigDiff configDiff;
|
9
|
+
private final boolean skipped;
|
9
10
|
private final List<Throwable> ignoredExceptions;
|
10
11
|
|
11
|
-
public ExecutionResult(ConfigDiff configDiff, List<Throwable> ignoredExceptions)
|
12
|
+
public ExecutionResult(ConfigDiff configDiff, boolean skipped, List<Throwable> ignoredExceptions)
|
12
13
|
{
|
13
14
|
this.configDiff = configDiff;
|
15
|
+
this.skipped = skipped;
|
14
16
|
this.ignoredExceptions = ignoredExceptions;
|
15
17
|
}
|
16
18
|
|
@@ -19,6 +21,11 @@ public class ExecutionResult
|
|
19
21
|
return configDiff;
|
20
22
|
}
|
21
23
|
|
24
|
+
public boolean isSkipped()
|
25
|
+
{
|
26
|
+
return skipped;
|
27
|
+
}
|
28
|
+
|
22
29
|
public List<Throwable> getIgnoredExceptions()
|
23
30
|
{
|
24
31
|
return ignoredExceptions;
|
@@ -0,0 +1,23 @@
|
|
1
|
+
package org.embulk.exec;
|
2
|
+
|
3
|
+
import org.embulk.config.ConfigDiff;
|
4
|
+
|
5
|
+
// Input/output plugins might need to stop Embulk before the transaction starts by depending
|
6
|
+
// on the conditions of input/output data sources/destinations. They can throw this exception
|
7
|
+
// if they want to do that. Embulk handles it and then stops the transaction.
|
8
|
+
public class SkipTransactionException
|
9
|
+
extends RuntimeException
|
10
|
+
{
|
11
|
+
private final ConfigDiff configDiff;
|
12
|
+
|
13
|
+
public SkipTransactionException(ConfigDiff configDiff)
|
14
|
+
{
|
15
|
+
super();
|
16
|
+
this.configDiff = configDiff;
|
17
|
+
}
|
18
|
+
|
19
|
+
public ConfigDiff getConfigDiff()
|
20
|
+
{
|
21
|
+
return configDiff;
|
22
|
+
}
|
23
|
+
}
|
@@ -1,12 +1,12 @@
|
|
1
1
|
Configuration
|
2
|
-
|
2
|
+
==============
|
3
3
|
|
4
4
|
.. contents::
|
5
5
|
:local:
|
6
6
|
:depth: 2
|
7
7
|
|
8
8
|
Embulk configuration file format
|
9
|
-
|
9
|
+
---------------------------------
|
10
10
|
|
11
11
|
Embulk uses a YAML file to define a bulk data loading. Here is an example of the file:
|
12
12
|
|
@@ -60,7 +60,7 @@ In many cases, what you need to write is **in:**, **out**: and **formatter** sec
|
|
60
60
|
|
61
61
|
|
62
62
|
Using variables
|
63
|
-
|
63
|
+
~~~~~~~~~~~~~~~~
|
64
64
|
|
65
65
|
You can embed environment variables in configuration file using `Liquid template engine <http://liquidmarkup.org/>`_ (This is experimental feature. Behavior might change or be removed in future releases).
|
66
66
|
|
@@ -89,7 +89,7 @@ Environment variables are set to ``env`` variable.
|
|
89
89
|
|
90
90
|
|
91
91
|
Including files
|
92
|
-
|
92
|
+
~~~~~~~~~~~~~~~~
|
93
93
|
|
94
94
|
Configuration file can include another configuration file. To use it, configuration file name must end with ``.yml.liquid``.
|
95
95
|
|
@@ -121,12 +121,12 @@ With above 2 files, actual configuration file will be:
|
|
121
121
|
|
122
122
|
|
123
123
|
Local file input plugin
|
124
|
-
|
124
|
+
------------------------
|
125
125
|
|
126
126
|
The ``file`` input plugin reads files from local file system.
|
127
127
|
|
128
128
|
Options
|
129
|
-
|
129
|
+
~~~~~~~~
|
130
130
|
|
131
131
|
+----------------+----------+------------------------------------------------+-----------+
|
132
132
|
| name | type | description | required? |
|
@@ -168,7 +168,7 @@ For example, if you set ``last_path: /path/to/files/sample_02.csv``, Embulk read
|
|
168
168
|
|-- sample_04.csv -> read
|
169
169
|
|
170
170
|
Example
|
171
|
-
|
171
|
+
~~~~~~~~
|
172
172
|
|
173
173
|
.. code-block:: yaml
|
174
174
|
|
@@ -187,7 +187,7 @@ CSV parser plugin
|
|
187
187
|
The ``csv`` parser plugin parses CSV and TSV files.
|
188
188
|
|
189
189
|
Options
|
190
|
-
|
190
|
+
~~~~~~~~
|
191
191
|
|
192
192
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
|
193
193
|
| name | type | description | required? |
|
@@ -271,7 +271,7 @@ The ``null_string`` option converts certain values to NULL. Values will be conve
|
|
271
271
|
You can use ``guess`` to automatically generate the column settings. See also `Quick Start <https://github.com/embulk/embulk#quick-start>`_.
|
272
272
|
|
273
273
|
Example
|
274
|
-
|
274
|
+
~~~~~~~~
|
275
275
|
|
276
276
|
.. code-block:: yaml
|
277
277
|
|
@@ -295,8 +295,30 @@ Example
|
|
295
295
|
- {name: comment, type: string}
|
296
296
|
|
297
297
|
|
298
|
+
.. note::
|
299
|
+
|
300
|
+
CSV parser supports ``format: '%s'`` to parse UNIX timestamp in seconds (e.g. 1470148959) as timestamp.
|
301
|
+
|
302
|
+
However, the CSV parser itself can't parse a UNIX timestamp in milliseconds (e.g. 1470148959542) as timestamp. You can still parse the column as ``long`` type first, then apply the `timestamp_format <https://github.com/sonots/embulk-filter-timestamp_format>`_ filter plugin to convert long to timestamp. Here is an example:
|
303
|
+
|
304
|
+
.. code-block:: yaml
|
305
|
+
|
306
|
+
in:
|
307
|
+
type: file
|
308
|
+
path_prefix: /my_csv_files
|
309
|
+
parser:
|
310
|
+
...
|
311
|
+
columns:
|
312
|
+
- {name: timestamp_in_seconds, type: timestamp, format: '%s'}
|
313
|
+
- {name: timestamp_in_millis, type: long}
|
314
|
+
filters:
|
315
|
+
- type: timestamp_format
|
316
|
+
columns:
|
317
|
+
- {name: timestamp_in_millis, from_unit: ms}
|
318
|
+
|
319
|
+
|
298
320
|
JSON parser plugin
|
299
|
-
|
321
|
+
-------------------
|
300
322
|
|
301
323
|
The ``json`` parser plugin parses a JSON file that contains a sequence of JSON objects. Example:
|
302
324
|
|
@@ -310,7 +332,7 @@ The ``json`` parser plugin parses a JSON file that contains a sequence of JSON o
|
|
310
332
|
``json`` parser plugin outputs a single record named "record" (type is json).
|
311
333
|
|
312
334
|
Options
|
313
|
-
|
335
|
+
~~~~~~~~
|
314
336
|
|
315
337
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
316
338
|
| name | type | description | required? |
|
@@ -320,7 +342,7 @@ Options
|
|
320
342
|
|
321
343
|
|
322
344
|
Example
|
323
|
-
|
345
|
+
~~~~~~~~
|
324
346
|
|
325
347
|
.. code-block:: yaml
|
326
348
|
|
@@ -329,17 +351,17 @@ Example
|
|
329
351
|
type: json
|
330
352
|
|
331
353
|
Gzip decoder plugin
|
332
|
-
|
354
|
+
--------------------
|
333
355
|
|
334
356
|
The ``gzip`` decoder plugin decompresses gzip files before input plugins read them.
|
335
357
|
|
336
358
|
Options
|
337
|
-
|
359
|
+
~~~~~~~~
|
338
360
|
|
339
361
|
This plugin doesn't have any options.
|
340
362
|
|
341
363
|
Example
|
342
|
-
|
364
|
+
~~~~~~~~
|
343
365
|
|
344
366
|
.. code-block:: yaml
|
345
367
|
|
@@ -350,17 +372,17 @@ Example
|
|
350
372
|
|
351
373
|
|
352
374
|
BZip2 decoder plugin
|
353
|
-
|
375
|
+
---------------------
|
354
376
|
|
355
377
|
The ``bzip2`` decoder plugin decompresses bzip2 files before input plugins read them.
|
356
378
|
|
357
379
|
Options
|
358
|
-
|
380
|
+
~~~~~~~~
|
359
381
|
|
360
382
|
This plugin doesn't have any options.
|
361
383
|
|
362
384
|
Example
|
363
|
-
|
385
|
+
~~~~~~~~
|
364
386
|
|
365
387
|
.. code-block:: yaml
|
366
388
|
|
@@ -371,12 +393,12 @@ Example
|
|
371
393
|
|
372
394
|
|
373
395
|
File output plugin
|
374
|
-
|
396
|
+
-------------------
|
375
397
|
|
376
398
|
The ``file`` output plugin writes records to local file system.
|
377
399
|
|
378
400
|
Options
|
379
|
-
|
401
|
+
~~~~~~~~
|
380
402
|
|
381
403
|
+--------------------+----------+---------------------------------------------------+----------------------------+
|
382
404
|
| name | type | description | required? |
|
@@ -404,7 +426,7 @@ For example, if you set ``path_prefix: /path/to/output/sample_``, ``sequence_for
|
|
404
426
|
``sequence_format`` formats task index and sequence number in a task.
|
405
427
|
|
406
428
|
Example
|
407
|
-
|
429
|
+
~~~~~~~~
|
408
430
|
|
409
431
|
.. code-block:: yaml
|
410
432
|
|
@@ -416,12 +438,12 @@ Example
|
|
416
438
|
...
|
417
439
|
|
418
440
|
CSV formatter plugin
|
419
|
-
|
441
|
+
---------------------
|
420
442
|
|
421
443
|
The ``csv`` formatter plugin formats records using CSV or TSV format.
|
422
444
|
|
423
445
|
Options
|
424
|
-
|
446
|
+
~~~~~~~~
|
425
447
|
|
426
448
|
+----------------------+---------+-------------------------------------------------------------------------------------------------------+-------------------------------+
|
427
449
|
| name | type | description | required? |
|
@@ -474,7 +496,7 @@ The ``column_options`` option is a map whose keys are name of columns, and value
|
|
474
496
|
+----------------------+---------+-------------------------------------------------------------------------------------------------------+-----------------------------------------+
|
475
497
|
|
476
498
|
Example
|
477
|
-
|
499
|
+
~~~~~~~~
|
478
500
|
|
479
501
|
.. code-block:: yaml
|
480
502
|
|
@@ -496,12 +518,12 @@ Example
|
|
496
518
|
mycol2: {format: '%Y-%m-%d %H:%M:%S', timezone: 'America/Los_Angeles'}
|
497
519
|
|
498
520
|
Gzip encoder plugin
|
499
|
-
|
521
|
+
--------------------
|
500
522
|
|
501
523
|
The ``gzip`` encoder plugin compresses output files using gzip.
|
502
524
|
|
503
525
|
Options
|
504
|
-
|
526
|
+
~~~~~~~~
|
505
527
|
|
506
528
|
+---------+----------+----------------------------------------------------------------------+--------------------+
|
507
529
|
| name | type | description | required? |
|
@@ -510,7 +532,7 @@ Options
|
|
510
532
|
+---------+----------+----------------------------------------------------------------------+--------------------+
|
511
533
|
|
512
534
|
Example
|
513
|
-
|
535
|
+
~~~~~~~~
|
514
536
|
|
515
537
|
.. code-block:: yaml
|
516
538
|
|
@@ -521,73 +543,285 @@ Example
|
|
521
543
|
level: 1
|
522
544
|
|
523
545
|
|
524
|
-
|
525
|
-
|
546
|
+
BZip2 encoder plugin
|
547
|
+
---------------------
|
526
548
|
|
527
|
-
The ``
|
549
|
+
The ``bzip2`` encoder plugin compresses output files using bzip2.
|
528
550
|
|
529
551
|
Options
|
530
|
-
|
552
|
+
~~~~~~~~
|
531
553
|
|
532
554
|
+---------+----------+----------------------------------------------------------------------+--------------------+
|
533
555
|
| name | type | description | required? |
|
534
556
|
+=========+==========+======================================================================+====================+
|
535
|
-
| level | integer | Compression level. From
|
557
|
+
| level | integer | Compression level. From 1 to 9 (best compression). | ``9`` by default |
|
536
558
|
+---------+----------+----------------------------------------------------------------------+--------------------+
|
537
559
|
|
538
560
|
Example
|
539
|
-
|
561
|
+
~~~~~~~~
|
540
562
|
|
541
563
|
.. code-block:: yaml
|
542
564
|
|
543
565
|
out:
|
544
566
|
...
|
545
567
|
encoders:
|
546
|
-
- type:
|
547
|
-
level:
|
568
|
+
- type: bzip2
|
569
|
+
level: 6
|
548
570
|
|
549
|
-
BZip2 encoder plugin
|
550
|
-
------------------
|
551
571
|
|
552
|
-
|
572
|
+
Rename filter plugin
|
573
|
+
---------------------
|
574
|
+
|
575
|
+
The ``rename`` filter plugin changes column names. This plugin has no impact on performance.
|
553
576
|
|
554
577
|
Options
|
555
|
-
|
578
|
+
~~~~~~~~
|
556
579
|
|
557
580
|
+---------+----------+----------------------------------------------------------------------+--------------------+
|
558
581
|
| name | type | description | required? |
|
559
582
|
+=========+==========+======================================================================+====================+
|
560
|
-
|
|
583
|
+
| rules | array | An array of rule-based renaming operations. (See below for rules.) | ``[]`` by default |
|
584
|
+
+---------+----------+----------------------------------------------------------------------+--------------------+
|
585
|
+
| columns | hash | A map whose keys are existing column names. values are new names. | ``{}`` by default |
|
561
586
|
+---------+----------+----------------------------------------------------------------------+--------------------+
|
562
587
|
|
588
|
+
Renaming rules
|
589
|
+
~~~~~~~~~~~~~~~
|
590
|
+
|
591
|
+
The ``rules`` is an array of rules as below applied top-down for all the columns.
|
592
|
+
|
593
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
594
|
+
| rule | description |
|
595
|
+
+=========================+========================================================================================+
|
596
|
+
| character\_types | Restrict characters by types. Replace restricted characters. |
|
597
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
598
|
+
| first\_character\_types | Restrict the first character by types. Prefix or replace first restricted characters. |
|
599
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
600
|
+
| lower\_to\_upper | Convert lower-case alphabets to upper-case. |
|
601
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
602
|
+
| regex\_replace | Replace with a regular expressions. |
|
603
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
604
|
+
| truncate | Truncate. |
|
605
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
606
|
+
| upper\_to\_lower | Convert upper-case alphabets to lower-case. |
|
607
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
608
|
+
| unique\_number\_suffix | Make column names unique in the schema. |
|
609
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
610
|
+
|
611
|
+
Renaming rule: character\_types
|
612
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
613
|
+
|
614
|
+
The rule ``character_types`` replaces restricted characters.
|
615
|
+
|
616
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
|
617
|
+
| option | description | required? |
|
618
|
+
+===================+============================================================================================================================================+====================+
|
619
|
+
| pass\_characters | Characters to be allowed. | ``""`` by default |
|
620
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
|
621
|
+
| pass\_types | Sets of characters to be allowed. The array must consist of "a-z" (lower-case alphabets), "A-Z" (upper-case alphabets), or "0-9" (digits). | ``[]`` by default |
|
622
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
|
623
|
+
| replace | A character that disallowed characters are replaced with. It must consist of just 1 character. | ``"_"`` by default |
|
624
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
|
625
|
+
|
563
626
|
Example
|
564
|
-
|
627
|
+
""""""""
|
565
628
|
|
566
629
|
.. code-block:: yaml
|
567
630
|
|
568
|
-
|
631
|
+
# This configuration replaces characters into "_" except for "_", lower-case alphabets, and digits.
|
632
|
+
filters:
|
569
633
|
...
|
570
|
-
|
571
|
-
|
572
|
-
|
634
|
+
- type: rename
|
635
|
+
rules:
|
636
|
+
- rule: character_types
|
637
|
+
pass_characters: "_"
|
638
|
+
pass_types: [ "a-z", "0-9" ]
|
573
639
|
|
574
640
|
|
575
|
-
|
576
|
-
|
641
|
+
Renaming rule: first\_character\_types
|
642
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
577
643
|
|
578
|
-
The ``
|
644
|
+
The rule ``first_character_types`` prefixes or replaces a restricted character at the beginning.
|
579
645
|
|
580
|
-
|
581
|
-
|
646
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
|
647
|
+
| option | description | required? |
|
648
|
+
+===================+============================================================================================================================================+==============================================+
|
649
|
+
| pass\_characters | Characters to be allowed. | ``""`` by default |
|
650
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
|
651
|
+
| pass\_types | Sets of characters to be allowed. The array must consist of "a-z" (lower-case alphabets), "A-Z" (upper-case alphabets), or "0-9" (digits). | ``[]`` by default |
|
652
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
|
653
|
+
| prefix | A character that a disallowed first character is prefixed with. | one of ``prefix`` or ``replace`` is required |
|
654
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
|
655
|
+
| replace | A character that a disallowed first character is replaced with. | one of ``prefix`` or ``replace`` is required |
|
656
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
|
582
657
|
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
658
|
+
Example
|
659
|
+
""""""""
|
660
|
+
|
661
|
+
.. code-block:: yaml
|
662
|
+
|
663
|
+
# This configuration prefixes a column name with "_" unless the name starts from "_" or a lower-case alphabet.
|
664
|
+
filters:
|
665
|
+
...
|
666
|
+
- type: rename
|
667
|
+
rules:
|
668
|
+
- rule: first_character_types
|
669
|
+
pass_characters: "_"
|
670
|
+
pass_types: [ "a-z" ]
|
671
|
+
prefix: "_"
|
672
|
+
|
673
|
+
Renaming rule: lower\_to\_upper
|
674
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
675
|
+
|
676
|
+
The rule ``lower_to_upper`` converts lower-case alphabets to upper-case.
|
677
|
+
|
678
|
+
Example
|
679
|
+
""""""""
|
680
|
+
|
681
|
+
.. code-block:: yaml
|
682
|
+
|
683
|
+
# This configuration converts all lower-case alphabets to upper-case.
|
684
|
+
filters:
|
685
|
+
...
|
686
|
+
- type: rename
|
687
|
+
rules:
|
688
|
+
- rule: lower_to_upper
|
689
|
+
|
690
|
+
|
691
|
+
Renaming rule: regex\_replace
|
692
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
693
|
+
|
694
|
+
The rule ``regex_replace`` replaces column names based on a regular expression.
|
695
|
+
|
696
|
+
+---------+--------------------------------------------------------------------------------------------------------------------------------------+-----------+
|
697
|
+
| option | description | required? |
|
698
|
+
+=========+======================================================================================================================================+===========+
|
699
|
+
| match | A `Java-style regular expression <https://docs.oracle.com/javase/tutorial/essential/regex/>`_ to which this string is to be matched. | required |
|
700
|
+
+---------+--------------------------------------------------------------------------------------------------------------------------------------+-----------+
|
701
|
+
| replace | A string to be substituted for each match in Java-style. | required |
|
702
|
+
+---------+--------------------------------------------------------------------------------------------------------------------------------------+-----------+
|
703
|
+
|
704
|
+
Example
|
705
|
+
""""""""
|
706
|
+
|
707
|
+
.. code-block:: yaml
|
708
|
+
|
709
|
+
# This configuration renames columns matching the pattern, e.g. "123_dollars" becomes "USD123".
|
710
|
+
filters:
|
711
|
+
...
|
712
|
+
- type: rename
|
713
|
+
rules:
|
714
|
+
- rule: regex_replace
|
715
|
+
match: "([0-9]+)_dollars"
|
716
|
+
replace: "USD$1"
|
717
|
+
|
718
|
+
|
719
|
+
Renaming rule: truncate
|
720
|
+
^^^^^^^^^^^^^^^^^^^^^^^^
|
721
|
+
|
722
|
+
The rule ``truncate`` truncates column names.
|
723
|
+
|
724
|
+
+------------+-----------------------------------------------------+--------------------+
|
725
|
+
| option | description | required? |
|
726
|
+
+============+=====================================================+====================+
|
727
|
+
| max_length | The length to which the column names are truncated. | ``128`` by default |
|
728
|
+
+------------+-----------------------------------------------------+--------------------+
|
729
|
+
|
730
|
+
Example
|
731
|
+
""""""""
|
732
|
+
|
733
|
+
.. code-block:: yaml
|
734
|
+
|
735
|
+
# This configuration drops all characters after the 20th character.
|
736
|
+
filters:
|
737
|
+
...
|
738
|
+
- type: rename
|
739
|
+
rules:
|
740
|
+
- rule: truncate
|
741
|
+
max_length: 20
|
742
|
+
|
743
|
+
Renaming rule: upper\_to\_lower
|
744
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
745
|
+
|
746
|
+
The rule ``upper_to_lower`` converts upper-case alphabets to lower-case.
|
747
|
+
|
748
|
+
Example
|
749
|
+
""""""""
|
750
|
+
|
751
|
+
.. code-block:: yaml
|
752
|
+
|
753
|
+
# This configuration converts all upper-case alphabets to lower-case.
|
754
|
+
filters:
|
755
|
+
...
|
756
|
+
- type: rename
|
757
|
+
rules:
|
758
|
+
- rule: upper_to_lower
|
759
|
+
|
760
|
+
Renaming rule: unique\_number\_suffix
|
761
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
762
|
+
|
763
|
+
The rule ``unique_number_suffix`` makes column names unique in the schema by suffixing numbers.
|
764
|
+
|
765
|
+
+------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
|
766
|
+
| option | description | required? |
|
767
|
+
+============+=============================================================================================================================+====================+
|
768
|
+
| delimiter | A delimiter character inserted before a suffix number. It must be just 1 non-digit character. | ``"_"`` by default |
|
769
|
+
+------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
|
770
|
+
| digits | An integer that specifies the number of zero-filled digits of a suffix number. The suffix number zero-filled to the digits. | optional |
|
771
|
+
+------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
|
772
|
+
| max_length | The length to which the column names are truncated. The column name is truncated before the suffix number. | optional |
|
773
|
+
+------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
|
774
|
+
| offset | An integer where the suffix number starts. The first duplicative column name is suffixed by (``offset`` + 1). | ``1`` by default |
|
775
|
+
+------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
|
776
|
+
|
777
|
+
.. hint::
|
778
|
+
The procedure to make column names unique is not very trivial. There are many feasible ways. This renaming rule works as follows:
|
779
|
+
|
780
|
+
Basic policies:
|
781
|
+
|
782
|
+
* Suffix numbers are counted per original column name.
|
783
|
+
* Column names are fixed from the first column to the last column.
|
784
|
+
|
785
|
+
Actual procedure applied from the first (leftmost) column to the last (rightmost) column:
|
786
|
+
|
787
|
+
1. Fix the column name as-is with truncating if the truncated name is not duplicated with left columns.
|
788
|
+
2. Suffix the column name otherwise.
|
789
|
+
|
790
|
+
a. Try to append the suffix number for the original column name with truncating.
|
791
|
+
b. Fix it if the suffixed name is not duplicated with left columns nor original columns.
|
792
|
+
c. Retry (a) with the suffix number increased otherwise.
|
588
793
|
|
589
794
|
Example
|
590
|
-
|
795
|
+
""""""""
|
796
|
+
|
797
|
+
.. code-block:: yaml
|
798
|
+
|
799
|
+
# This configuration suffixes numbers to duplicative column names. (Ex. ["column", "column", "column"] goes to ["column", "column_2", "column_3"].)
|
800
|
+
filters:
|
801
|
+
...
|
802
|
+
- type: rename
|
803
|
+
rules:
|
804
|
+
- rule: unique_number_suffix
|
805
|
+
|
806
|
+
Example of renaming rules
|
807
|
+
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
808
|
+
|
809
|
+
.. code-block:: yaml
|
810
|
+
|
811
|
+
filters:
|
812
|
+
...
|
813
|
+
- type: rename
|
814
|
+
rules:
|
815
|
+
- rule: upper_to_lower # All upper-case are converted to lower-case.
|
816
|
+
- rule: character_types # Only lower-case, digits and "_" are allowed. (No upper-case by the rule above.)
|
817
|
+
pass_types: [ "a-z", "0-9" ]
|
818
|
+
pass_characters: "_"
|
819
|
+
- rule: unique_number_suffix # Ensure all column names are unique.
|
820
|
+
|
821
|
+
Columns: not recommended
|
822
|
+
~~~~~~~~~~~~~~~~~~~~~~~~~
|
823
|
+
|
824
|
+
``columns`` is not recommended to use anymore. Consider using ``rules`` instead.
|
591
825
|
|
592
826
|
.. code-block:: yaml
|
593
827
|
|
@@ -598,13 +832,16 @@ Example
|
|
598
832
|
my_existing_column1: new_column1
|
599
833
|
my_existing_column2: new_column2
|
600
834
|
|
835
|
+
.. hint::
|
836
|
+
``columns`` are applied before ``rules`` if ``columns`` and ``rules`` are specified together. (It is discouraged to specify them together, though.)
|
837
|
+
|
601
838
|
Local executor plugin
|
602
|
-
|
839
|
+
----------------------
|
603
840
|
|
604
841
|
The ``local`` executor plugin runs tasks using local threads. This is the only built-in executor plugin.
|
605
842
|
|
606
843
|
Options
|
607
|
-
|
844
|
+
~~~~~~~~
|
608
845
|
|
609
846
|
+------------------+----------+----------------------------------------------------------------------+--------------------------------------+
|
610
847
|
| name | type | description | required? |
|
@@ -620,7 +857,7 @@ The ``max_threads`` option controls maximum concurrency. Setting smaller number
|
|
620
857
|
The ``min_output_tasks`` option enables "page scattering". The feature is enabled if the number of input tasks is less than ``min_output_tasks``. It uses multiple filter & output threads for each input task so that one input task can use multiple threads. Setting a larger number here is useful if Embulk doesn't use multi-threading with enough concurrency because there are too few input tasks. Setting 1 here disables page scattering completely.
|
621
858
|
|
622
859
|
Example
|
623
|
-
|
860
|
+
~~~~~~~~
|
624
861
|
|
625
862
|
.. code-block:: yaml
|
626
863
|
|
@@ -635,12 +872,12 @@ Example
|
|
635
872
|
...
|
636
873
|
|
637
874
|
Guess executor
|
638
|
-
|
875
|
+
---------------
|
639
876
|
|
640
877
|
The guess executor is called by the ``guess`` command. It executes the default guess plugins in sequential order and suggests an Embulk config using the appropriate guess plugin. The default guess plugins, in order, are ``gzip``, ``bzip2``, ``json`` and ``csv``.
|
641
878
|
|
642
879
|
Options
|
643
|
-
|
880
|
+
~~~~~~~~
|
644
881
|
|
645
882
|
+-----------------------+----------+----------------------------------------------------------------------+--------------------------------------+
|
646
883
|
| name | type | description | required? |
|
@@ -657,7 +894,7 @@ The ``exclude_guess_plugins`` option exclude specified guess plugins from the li
|
|
657
894
|
This example shows how to use the ``csv_all_strings`` guess plugin, which suggests string types for all columns in CSV files. Users must specify it explicitly when using it instead of the ``csv`` guess plugin, because it is not included in the default guess plugins. The default ``csv`` guess plugin can also be excluded.
|
658
895
|
|
659
896
|
Example
|
660
|
-
|
897
|
+
~~~~~~~~
|
661
898
|
|
662
899
|
.. code-block:: yaml
|
663
900
|
|