embulk 0.8.14-java → 0.8.15-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/build.gradle +3 -3
- data/embulk-core/src/main/java/org/embulk/EmbulkEmbed.java +1 -1
- data/embulk-core/src/main/java/org/embulk/exec/BulkLoader.java +35 -7
- data/embulk-core/src/main/java/org/embulk/exec/ExecModule.java +2 -0
- data/embulk-core/src/main/java/org/embulk/exec/ExecutionResult.java +8 -1
- data/embulk-core/src/main/java/org/embulk/exec/SkipTransactionException.java +23 -0
- data/embulk-docs/src/built-in.rst +299 -62
- data/embulk-docs/src/release.rst +1 -0
- data/embulk-docs/src/release/release-0.8.15.rst +17 -0
- data/embulk-standards/src/main/java/org/embulk/standards/RenameFilterPlugin.java +374 -1
- data/embulk-standards/src/test/java/org/embulk/standards/TestRenameFilterPlugin.java +872 -0
- data/embulk-test/build.gradle +6 -0
- data/embulk-test/src/main/java/org/embulk/test/EmbulkTests.java +75 -0
- data/embulk-test/src/main/java/org/embulk/test/TestingBulkLoader.java +124 -0
- data/embulk-test/src/main/java/org/embulk/test/TestingEmbulk.java +223 -0
- data/lib/embulk/version.rb +1 -1
- data/settings.gradle +1 -0
- metadata +11 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c47e67c0b46d1a8dcd84337dee0d3479fd37f45f
|
4
|
+
data.tar.gz: edcd6bafc9095768db0798dcae9c0d53532f5054
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1946063a900859720ba33919583f1bcfabb543af948b02260f3f91664445c2754aa2014e5b9c3c41cddd800ecb238b2450c39a0520d825d4f03d9d693f7d04f3
|
7
|
+
data.tar.gz: 8348fd5001511a3b718668eccb8b2686fd6cc2cf9635b38f2c941f79b20aa185c83bc01bb535559782895ead8a52b5cdb823c341a177afed7af1dff568e57001
|
data/README.md
CHANGED
data/build.gradle
CHANGED
@@ -11,12 +11,12 @@ apply plugin: "com.github.jruby-gradle.jar"
|
|
11
11
|
apply plugin: 'com.jfrog.bintray'
|
12
12
|
apply plugin: 'com.github.johnrengelman.shadow'
|
13
13
|
|
14
|
-
def java_projects = [project(":embulk-core"), project(":embulk-standards"), project(":embulk-cli")]
|
15
|
-
def release_projects = [project(":embulk-core"), project(":embulk-standards")]
|
14
|
+
def java_projects = [project(":embulk-core"), project(":embulk-standards"), project(":embulk-cli"), project(":embulk-test")]
|
15
|
+
def release_projects = [project(":embulk-core"), project(":embulk-standards"), project(":embulk-test")]
|
16
16
|
|
17
17
|
allprojects {
|
18
18
|
group = 'org.embulk'
|
19
|
-
version = '0.8.
|
19
|
+
version = '0.8.15'
|
20
20
|
|
21
21
|
ext {
|
22
22
|
jrubyVersion = '9.1.5.0'
|
@@ -126,7 +126,7 @@ public class EmbulkEmbed
|
|
126
126
|
{
|
127
127
|
this.injector = injector;
|
128
128
|
injector.getInstance(org.slf4j.ILoggerFactory.class);
|
129
|
-
this.bulkLoader =
|
129
|
+
this.bulkLoader = injector.getInstance(BulkLoader.class);
|
130
130
|
this.guessExecutor = injector.getInstance(GuessExecutor.class);
|
131
131
|
this.previewExecutor = injector.getInstance(PreviewExecutor.class);
|
132
132
|
}
|
@@ -62,7 +62,7 @@ public class BulkLoader
|
|
62
62
|
this.injector = injector;
|
63
63
|
}
|
64
64
|
|
65
|
-
|
65
|
+
protected static class LoaderState
|
66
66
|
implements ProcessState
|
67
67
|
{
|
68
68
|
private final Logger logger;
|
@@ -343,7 +343,17 @@ public class BulkLoader
|
|
343
343
|
ignoredExceptions.add(ex);
|
344
344
|
}
|
345
345
|
|
346
|
-
return new ExecutionResult(configDiff, ignoredExceptions.build());
|
346
|
+
return new ExecutionResult(configDiff, false, ignoredExceptions.build());
|
347
|
+
}
|
348
|
+
|
349
|
+
public ExecutionResult buildExecuteResultOfSkippedExecution(ConfigDiff configDiff)
|
350
|
+
{
|
351
|
+
ImmutableList.Builder<Throwable> ignoredExceptions = ImmutableList.builder();
|
352
|
+
for (Throwable e : getExceptions()) {
|
353
|
+
ignoredExceptions.add(e);
|
354
|
+
}
|
355
|
+
|
356
|
+
return new ExecutionResult(configDiff, true, ignoredExceptions.build());
|
347
357
|
}
|
348
358
|
|
349
359
|
public ResumeState buildResumeState(ExecSession exec)
|
@@ -364,6 +374,11 @@ public class BulkLoader
|
|
364
374
|
}
|
365
375
|
}
|
366
376
|
|
377
|
+
protected LoaderState newLoaderState(Logger logger, ProcessPluginSet plugins)
|
378
|
+
{
|
379
|
+
return new LoaderState(logger, plugins);
|
380
|
+
}
|
381
|
+
|
367
382
|
public ExecutionResult run(ExecSession exec, final ConfigSource config)
|
368
383
|
{
|
369
384
|
try {
|
@@ -418,7 +433,7 @@ public class BulkLoader
|
|
418
433
|
}
|
419
434
|
}
|
420
435
|
|
421
|
-
|
436
|
+
protected static class ProcessPluginSet
|
422
437
|
{
|
423
438
|
private final PluginType inputPluginType;
|
424
439
|
private final PluginType outputPluginType;
|
@@ -507,7 +522,7 @@ public class BulkLoader
|
|
507
522
|
final ExecutorPlugin exec = newExecutorPlugin(task);
|
508
523
|
final ProcessPluginSet plugins = new ProcessPluginSet(task);
|
509
524
|
|
510
|
-
final LoaderState state =
|
525
|
+
final LoaderState state = newLoaderState(Exec.getLogger(BulkLoader.class), plugins);
|
511
526
|
state.setTransactionStage(TransactionStage.INPUT_BEGIN);
|
512
527
|
try {
|
513
528
|
ConfigDiff inputConfigDiff = plugins.getInputPlugin().transaction(task.getInputConfig(), new InputPlugin.Control() {
|
@@ -565,7 +580,11 @@ public class BulkLoader
|
|
565
580
|
return state.buildExecuteResult();
|
566
581
|
|
567
582
|
} catch (Throwable ex) {
|
568
|
-
if (
|
583
|
+
if (isSkippedTransaction(ex)) {
|
584
|
+
ConfigDiff configDiff = ((SkipTransactionException) ex).getConfigDiff();
|
585
|
+
return state.buildExecuteResultOfSkippedExecution(configDiff);
|
586
|
+
}
|
587
|
+
else if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
|
569
588
|
// ignore the exception
|
570
589
|
return state.buildExecuteResultWithWarningException(ex);
|
571
590
|
}
|
@@ -580,7 +599,7 @@ public class BulkLoader
|
|
580
599
|
final ExecutorPlugin exec = newExecutorPlugin(task);
|
581
600
|
final ProcessPluginSet plugins = new ProcessPluginSet(task);
|
582
601
|
|
583
|
-
final LoaderState state =
|
602
|
+
final LoaderState state = newLoaderState(Exec.getLogger(BulkLoader.class), plugins);
|
584
603
|
state.setTransactionStage(TransactionStage.INPUT_BEGIN);
|
585
604
|
try {
|
586
605
|
ConfigDiff inputConfigDiff = plugins.getInputPlugin().resume(resume.getInputTaskSource(), resume.getInputSchema(), resume.getInputTaskReports().size(), new InputPlugin.Control() {
|
@@ -642,7 +661,11 @@ public class BulkLoader
|
|
642
661
|
return state.buildExecuteResult();
|
643
662
|
|
644
663
|
} catch (Throwable ex) {
|
645
|
-
if (
|
664
|
+
if (isSkippedTransaction(ex)) {
|
665
|
+
ConfigDiff configDiff = ((SkipTransactionException) ex).getConfigDiff();
|
666
|
+
return state.buildExecuteResultOfSkippedExecution(configDiff);
|
667
|
+
}
|
668
|
+
else if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
|
646
669
|
// ignore the exception
|
647
670
|
return state.buildExecuteResultWithWarningException(ex);
|
648
671
|
}
|
@@ -650,6 +673,11 @@ public class BulkLoader
|
|
650
673
|
}
|
651
674
|
}
|
652
675
|
|
676
|
+
private static boolean isSkippedTransaction(Throwable ex)
|
677
|
+
{
|
678
|
+
return ex instanceof SkipTransactionException;
|
679
|
+
}
|
680
|
+
|
653
681
|
private static void restoreResumedTaskReports(ResumeState resume, LoaderState state)
|
654
682
|
{
|
655
683
|
int inputTaskCount = resume.getInputTaskReports().size();
|
@@ -26,6 +26,8 @@ public class ExecModule
|
|
26
26
|
{
|
27
27
|
Preconditions.checkNotNull(binder, "binder is null.");
|
28
28
|
|
29
|
+
binder.bind(BulkLoader.class);
|
30
|
+
|
29
31
|
binder.bind(ILoggerFactory.class).toProvider(LoggerProvider.class).in(Scopes.SINGLETON);
|
30
32
|
binder.bind(ModelManager.class).in(Scopes.SINGLETON);
|
31
33
|
binder.bind(BufferAllocator.class).to(PooledBufferAllocator.class).in(Scopes.SINGLETON);
|
@@ -6,11 +6,13 @@ import org.embulk.config.ConfigDiff;
|
|
6
6
|
public class ExecutionResult
|
7
7
|
{
|
8
8
|
private final ConfigDiff configDiff;
|
9
|
+
private final boolean skipped;
|
9
10
|
private final List<Throwable> ignoredExceptions;
|
10
11
|
|
11
|
-
public ExecutionResult(ConfigDiff configDiff, List<Throwable> ignoredExceptions)
|
12
|
+
public ExecutionResult(ConfigDiff configDiff, boolean skipped, List<Throwable> ignoredExceptions)
|
12
13
|
{
|
13
14
|
this.configDiff = configDiff;
|
15
|
+
this.skipped = skipped;
|
14
16
|
this.ignoredExceptions = ignoredExceptions;
|
15
17
|
}
|
16
18
|
|
@@ -19,6 +21,11 @@ public class ExecutionResult
|
|
19
21
|
return configDiff;
|
20
22
|
}
|
21
23
|
|
24
|
+
public boolean isSkipped()
|
25
|
+
{
|
26
|
+
return skipped;
|
27
|
+
}
|
28
|
+
|
22
29
|
public List<Throwable> getIgnoredExceptions()
|
23
30
|
{
|
24
31
|
return ignoredExceptions;
|
@@ -0,0 +1,23 @@
|
|
1
|
+
package org.embulk.exec;
|
2
|
+
|
3
|
+
import org.embulk.config.ConfigDiff;
|
4
|
+
|
5
|
+
// Input/output plugins might need to stop Embulk before the transaction starts by depending
|
6
|
+
// on the conditions of input/output data sources/destinations. They can throw this exception
|
7
|
+
// if they want to do that. Embulk handles it and then stops the transaction.
|
8
|
+
public class SkipTransactionException
|
9
|
+
extends RuntimeException
|
10
|
+
{
|
11
|
+
private final ConfigDiff configDiff;
|
12
|
+
|
13
|
+
public SkipTransactionException(ConfigDiff configDiff)
|
14
|
+
{
|
15
|
+
super();
|
16
|
+
this.configDiff = configDiff;
|
17
|
+
}
|
18
|
+
|
19
|
+
public ConfigDiff getConfigDiff()
|
20
|
+
{
|
21
|
+
return configDiff;
|
22
|
+
}
|
23
|
+
}
|
@@ -1,12 +1,12 @@
|
|
1
1
|
Configuration
|
2
|
-
|
2
|
+
==============
|
3
3
|
|
4
4
|
.. contents::
|
5
5
|
:local:
|
6
6
|
:depth: 2
|
7
7
|
|
8
8
|
Embulk configuration file format
|
9
|
-
|
9
|
+
---------------------------------
|
10
10
|
|
11
11
|
Embulk uses a YAML file to define a bulk data loading. Here is an example of the file:
|
12
12
|
|
@@ -60,7 +60,7 @@ In many cases, what you need to write is **in:**, **out**: and **formatter** sec
|
|
60
60
|
|
61
61
|
|
62
62
|
Using variables
|
63
|
-
|
63
|
+
~~~~~~~~~~~~~~~~
|
64
64
|
|
65
65
|
You can embed environment variables in configuration file using `Liquid template engine <http://liquidmarkup.org/>`_ (This is experimental feature. Behavior might change or be removed in future releases).
|
66
66
|
|
@@ -89,7 +89,7 @@ Environment variables are set to ``env`` variable.
|
|
89
89
|
|
90
90
|
|
91
91
|
Including files
|
92
|
-
|
92
|
+
~~~~~~~~~~~~~~~~
|
93
93
|
|
94
94
|
Configuration file can include another configuration file. To use it, configuration file name must end with ``.yml.liquid``.
|
95
95
|
|
@@ -121,12 +121,12 @@ With above 2 files, actual configuration file will be:
|
|
121
121
|
|
122
122
|
|
123
123
|
Local file input plugin
|
124
|
-
|
124
|
+
------------------------
|
125
125
|
|
126
126
|
The ``file`` input plugin reads files from local file system.
|
127
127
|
|
128
128
|
Options
|
129
|
-
|
129
|
+
~~~~~~~~
|
130
130
|
|
131
131
|
+----------------+----------+------------------------------------------------+-----------+
|
132
132
|
| name | type | description | required? |
|
@@ -168,7 +168,7 @@ For example, if you set ``last_path: /path/to/files/sample_02.csv``, Embulk read
|
|
168
168
|
|-- sample_04.csv -> read
|
169
169
|
|
170
170
|
Example
|
171
|
-
|
171
|
+
~~~~~~~~
|
172
172
|
|
173
173
|
.. code-block:: yaml
|
174
174
|
|
@@ -187,7 +187,7 @@ CSV parser plugin
|
|
187
187
|
The ``csv`` parser plugin parses CSV and TSV files.
|
188
188
|
|
189
189
|
Options
|
190
|
-
|
190
|
+
~~~~~~~~
|
191
191
|
|
192
192
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
|
193
193
|
| name | type | description | required? |
|
@@ -271,7 +271,7 @@ The ``null_string`` option converts certain values to NULL. Values will be conve
|
|
271
271
|
You can use ``guess`` to automatically generate the column settings. See also `Quick Start <https://github.com/embulk/embulk#quick-start>`_.
|
272
272
|
|
273
273
|
Example
|
274
|
-
|
274
|
+
~~~~~~~~
|
275
275
|
|
276
276
|
.. code-block:: yaml
|
277
277
|
|
@@ -295,8 +295,30 @@ Example
|
|
295
295
|
- {name: comment, type: string}
|
296
296
|
|
297
297
|
|
298
|
+
.. note::
|
299
|
+
|
300
|
+
CSV parser supports ``format: '%s'`` to parse UNIX timestamp in seconds (e.g. 1470148959) as timestamp.
|
301
|
+
|
302
|
+
However, CSV parser itself can't parse UNIX timestamp in milliseconds (e.g. 1470148959542) as timestamp. You can still parse the column as ``long`` type first, then apply `timestamp_format <https://github.com/sonots/embulk-filter-timestamp_format>`_ filter plugin to convert long to timestamp. Here is an example:
|
303
|
+
|
304
|
+
.. code-block:: yaml
|
305
|
+
|
306
|
+
in:
|
307
|
+
type: file
|
308
|
+
path_prefix: /my_csv_files
|
309
|
+
parser:
|
310
|
+
...
|
311
|
+
columns:
|
312
|
+
- {name: timestamp_in_seconds, type: timestamp, format: '%s'}
|
313
|
+
- {name: timestamp_in_millis, type: long}
|
314
|
+
filters:
|
315
|
+
- type: timestamp_format
|
316
|
+
columns:
|
317
|
+
- {name: timestamp_in_millis, from_unit: ms}
|
318
|
+
|
319
|
+
|
298
320
|
JSON parser plugin
|
299
|
-
|
321
|
+
-------------------
|
300
322
|
|
301
323
|
The ``json`` parser plugin parses a JSON file that contains a sequence of JSON objects. Example:
|
302
324
|
|
@@ -310,7 +332,7 @@ The ``json`` parser plugin parses a JSON file that contains a sequence of JSON o
|
|
310
332
|
``json`` parser plugin outputs a single record named "record" (type is json).
|
311
333
|
|
312
334
|
Options
|
313
|
-
|
335
|
+
~~~~~~~~
|
314
336
|
|
315
337
|
+----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
|
316
338
|
| name | type | description | required? |
|
@@ -320,7 +342,7 @@ Options
|
|
320
342
|
|
321
343
|
|
322
344
|
Example
|
323
|
-
|
345
|
+
~~~~~~~~
|
324
346
|
|
325
347
|
.. code-block:: yaml
|
326
348
|
|
@@ -329,17 +351,17 @@ Example
|
|
329
351
|
type: json
|
330
352
|
|
331
353
|
Gzip decoder plugin
|
332
|
-
|
354
|
+
--------------------
|
333
355
|
|
334
356
|
The ``gzip`` decoder plugin decompresses gzip files before input plugins read them.
|
335
357
|
|
336
358
|
Options
|
337
|
-
|
359
|
+
~~~~~~~~
|
338
360
|
|
339
361
|
This plugin doesn't have any options.
|
340
362
|
|
341
363
|
Example
|
342
|
-
|
364
|
+
~~~~~~~~
|
343
365
|
|
344
366
|
.. code-block:: yaml
|
345
367
|
|
@@ -350,17 +372,17 @@ Example
|
|
350
372
|
|
351
373
|
|
352
374
|
BZip2 decoder plugin
|
353
|
-
|
375
|
+
---------------------
|
354
376
|
|
355
377
|
The ``bzip2`` decoder plugin decompresses bzip2 files before input plugins read them.
|
356
378
|
|
357
379
|
Options
|
358
|
-
|
380
|
+
~~~~~~~~
|
359
381
|
|
360
382
|
This plugin doesn't have any options.
|
361
383
|
|
362
384
|
Example
|
363
|
-
|
385
|
+
~~~~~~~~
|
364
386
|
|
365
387
|
.. code-block:: yaml
|
366
388
|
|
@@ -371,12 +393,12 @@ Example
|
|
371
393
|
|
372
394
|
|
373
395
|
File output plugin
|
374
|
-
|
396
|
+
-------------------
|
375
397
|
|
376
398
|
The ``file`` output plugin writes records to local file system.
|
377
399
|
|
378
400
|
Options
|
379
|
-
|
401
|
+
~~~~~~~~
|
380
402
|
|
381
403
|
+--------------------+----------+---------------------------------------------------+----------------------------+
|
382
404
|
| name | type | description | required? |
|
@@ -404,7 +426,7 @@ For example, if you set ``path_prefix: /path/to/output/sample_``, ``sequence_for
|
|
404
426
|
``sequence_format`` formats task index and sequence number in a task.
|
405
427
|
|
406
428
|
Example
|
407
|
-
|
429
|
+
~~~~~~~~
|
408
430
|
|
409
431
|
.. code-block:: yaml
|
410
432
|
|
@@ -416,12 +438,12 @@ Example
|
|
416
438
|
...
|
417
439
|
|
418
440
|
CSV formatter plugin
|
419
|
-
|
441
|
+
---------------------
|
420
442
|
|
421
443
|
The ``csv`` formatter plugin formats records using CSV or TSV format.
|
422
444
|
|
423
445
|
Options
|
424
|
-
|
446
|
+
~~~~~~~~
|
425
447
|
|
426
448
|
+----------------------+---------+-------------------------------------------------------------------------------------------------------+-------------------------------+
|
427
449
|
| name | type | description | required? |
|
@@ -474,7 +496,7 @@ The ``column_options`` option is a map whose keys are name of columns, and value
|
|
474
496
|
+----------------------+---------+-------------------------------------------------------------------------------------------------------+-----------------------------------------+
|
475
497
|
|
476
498
|
Example
|
477
|
-
|
499
|
+
~~~~~~~~
|
478
500
|
|
479
501
|
.. code-block:: yaml
|
480
502
|
|
@@ -496,12 +518,12 @@ Example
|
|
496
518
|
mycol2: {format: '%Y-%m-%d %H:%M:%S', timezone: 'America/Los_Angeles'}
|
497
519
|
|
498
520
|
Gzip encoder plugin
|
499
|
-
|
521
|
+
--------------------
|
500
522
|
|
501
523
|
The ``gzip`` encoder plugin compresses output files using gzip.
|
502
524
|
|
503
525
|
Options
|
504
|
-
|
526
|
+
~~~~~~~~
|
505
527
|
|
506
528
|
+---------+----------+----------------------------------------------------------------------+--------------------+
|
507
529
|
| name | type | description | required? |
|
@@ -510,7 +532,7 @@ Options
|
|
510
532
|
+---------+----------+----------------------------------------------------------------------+--------------------+
|
511
533
|
|
512
534
|
Example
|
513
|
-
|
535
|
+
~~~~~~~~
|
514
536
|
|
515
537
|
.. code-block:: yaml
|
516
538
|
|
@@ -521,73 +543,285 @@ Example
|
|
521
543
|
level: 1
|
522
544
|
|
523
545
|
|
524
|
-
|
525
|
-
|
546
|
+
BZip2 encoder plugin
|
547
|
+
---------------------
|
526
548
|
|
527
|
-
The ``
|
549
|
+
The ``bzip2`` encoder plugin compresses output files using bzip2.
|
528
550
|
|
529
551
|
Options
|
530
|
-
|
552
|
+
~~~~~~~~
|
531
553
|
|
532
554
|
+---------+----------+----------------------------------------------------------------------+--------------------+
|
533
555
|
| name | type | description | required? |
|
534
556
|
+=========+==========+======================================================================+====================+
|
535
|
-
| level | integer | Compression level. From
|
557
|
+
| level | integer | Compression level. From 1 to 9 (best compression). | ``9`` by default |
|
536
558
|
+---------+----------+----------------------------------------------------------------------+--------------------+
|
537
559
|
|
538
560
|
Example
|
539
|
-
|
561
|
+
~~~~~~~~
|
540
562
|
|
541
563
|
.. code-block:: yaml
|
542
564
|
|
543
565
|
out:
|
544
566
|
...
|
545
567
|
encoders:
|
546
|
-
- type:
|
547
|
-
level:
|
568
|
+
- type: bzip2
|
569
|
+
level: 6
|
548
570
|
|
549
|
-
BZip2 encoder plugin
|
550
|
-
------------------
|
551
571
|
|
552
|
-
|
572
|
+
Rename filter plugin
|
573
|
+
---------------------
|
574
|
+
|
575
|
+
The ``rename`` filter plugin changes column names. This plugin has no impact on performance.
|
553
576
|
|
554
577
|
Options
|
555
|
-
|
578
|
+
~~~~~~~~
|
556
579
|
|
557
580
|
+---------+----------+----------------------------------------------------------------------+--------------------+
|
558
581
|
| name | type | description | required? |
|
559
582
|
+=========+==========+======================================================================+====================+
|
560
|
-
|
|
583
|
+
| rules | array | An array of rule-based renaming operations. (See below for rules.) | ``[]`` by default |
|
584
|
+
+---------+----------+----------------------------------------------------------------------+--------------------+
|
585
|
+
| columns | hash | A map whose keys are existing column names. values are new names. | ``{}`` by default |
|
561
586
|
+---------+----------+----------------------------------------------------------------------+--------------------+
|
562
587
|
|
588
|
+
Renaming rules
|
589
|
+
~~~~~~~~~~~~~~~
|
590
|
+
|
591
|
+
The ``rules`` is an array of rules as below applied top-down for all the columns.
|
592
|
+
|
593
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
594
|
+
| rule | description |
|
595
|
+
+=========================+========================================================================================+
|
596
|
+
| character\_types | Restrict characters by types. Replace restricted characters. |
|
597
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
598
|
+
| first\_character\_types | Restrict the first character by types. Prefix or replace first restricted characters. |
|
599
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
600
|
+
| lower\_to\_upper | Convert lower-case alphabets to upper-case. |
|
601
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
602
|
+
| regex\_replace | Replace with a regular expression. |
|
603
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
604
|
+
| truncate | Truncate. |
|
605
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
606
|
+
| upper\_to\_lower | Convert upper-case alphabets to lower-case. |
|
607
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
608
|
+
| unique\_number\_suffix | Make column names unique in the schema. |
|
609
|
+
+-------------------------+----------------------------------------------------------------------------------------+
|
610
|
+
|
611
|
+
Renaming rule: character\_types
|
612
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
613
|
+
|
614
|
+
The rule ``character_types`` replaces restricted characters.
|
615
|
+
|
616
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
|
617
|
+
| option | description | required? |
|
618
|
+
+===================+============================================================================================================================================+====================+
|
619
|
+
| pass\_characters | Characters to be allowed. | ``""`` by default |
|
620
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
|
621
|
+
| pass\_types | Sets of characters to be allowed. The array must consist of "a-z" (lower-case alphabets), "A-Z" (upper-case alphabets), or "0-9" (digits). | ``[]`` by default |
|
622
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
|
623
|
+
| replace | A character that disallowed characters are replaced with. It must consist of just 1 character. | ``"_"`` by default |
|
624
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
|
625
|
+
|
563
626
|
Example
|
564
|
-
|
627
|
+
""""""""
|
565
628
|
|
566
629
|
.. code-block:: yaml
|
567
630
|
|
568
|
-
|
631
|
+
# This configuration replaces characters into "_" except for "_", lower-case alphabets, and digits.
|
632
|
+
filters:
|
569
633
|
...
|
570
|
-
|
571
|
-
|
572
|
-
|
634
|
+
- type: rename
|
635
|
+
rules:
|
636
|
+
- rule: character_types
|
637
|
+
pass_characters: "_"
|
638
|
+
pass_types: [ "a-z", "0-9" ]
|
573
639
|
|
574
640
|
|
575
|
-
|
576
|
-
|
641
|
+
Renaming rule: first\_character\_types
|
642
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
577
643
|
|
578
|
-
The ``
|
644
|
+
The rule ``first_character_types`` prefixes or replaces a restricted character at the beginning.
|
579
645
|
|
580
|
-
|
581
|
-
|
646
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
|
647
|
+
| option | description | required? |
|
648
|
+
+===================+============================================================================================================================================+==============================================+
|
649
|
+
| pass\_characters | Characters to be allowed. | ``""`` by default |
|
650
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
|
651
|
+
| pass\_types | Sets of characters to be allowed. The array must consist of "a-z" (lower-case alphabets), "A-Z" (upper-case alphabets), or "0-9" (digits). | ``[]`` by default |
|
652
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
|
653
|
+
| prefix | A character that a disallowed first character is prefixed with. | one of ``prefix`` or ``replace`` is required |
|
654
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
|
655
|
+
| replace | A character that a disallowed first character is replaced with. | one of ``prefix`` or ``replace`` is required |
|
656
|
+
+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
|
582
657
|
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
658
|
+
Example
|
659
|
+
""""""""
|
660
|
+
|
661
|
+
.. code-block:: yaml
|
662
|
+
|
663
|
+
# This configuration prefixes a column name with "_" unless the name starts from "_" or a lower-case alphabet.
|
664
|
+
filters:
|
665
|
+
...
|
666
|
+
- type: rename
|
667
|
+
rules:
|
668
|
+
- rule: first_character_types
|
669
|
+
pass_characters: "_"
|
670
|
+
pass_types: [ "a-z" ]
|
671
|
+
prefix: "_"
|
672
|
+
|
673
|
+
Renaming rule: lower\_to\_upper
|
674
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
675
|
+
|
676
|
+
The rule ``lower_to_upper`` converts lower-case alphabets to upper-case.
|
677
|
+
|
678
|
+
Example
|
679
|
+
""""""""
|
680
|
+
|
681
|
+
.. code-block:: yaml
|
682
|
+
|
683
|
+
# This configuration converts all lower-case alphabets to upper-case.
|
684
|
+
filters:
|
685
|
+
...
|
686
|
+
- type: rename
|
687
|
+
rules:
|
688
|
+
- rule: lower_to_upper
|
689
|
+
|
690
|
+
|
691
|
+
Renaming rule: regex\_replace
|
692
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
693
|
+
|
694
|
+
The rule ``regex_replace`` replaces column names based on a regular expression.
|
695
|
+
|
696
|
+
+---------+--------------------------------------------------------------------------------------------------------------------------------------+-----------+
|
697
|
+
| option | description | required? |
|
698
|
+
+=========+======================================================================================================================================+===========+
|
699
|
+
| match | A `Java-style regular expression <https://docs.oracle.com/javase/tutorial/essential/regex/>`_ to which this string is to be matched. | required |
|
700
|
+
+---------+--------------------------------------------------------------------------------------------------------------------------------------+-----------+
|
701
|
+
| replace | A string to be substituted for each match in Java-style. | required |
|
702
|
+
+---------+--------------------------------------------------------------------------------------------------------------------------------------+-----------+
|
703
|
+
|
704
|
+
Example
|
705
|
+
""""""""
|
706
|
+
|
707
|
+
.. code-block:: yaml
|
708
|
+
|
709
|
+
# This configuration replaces every match of "<digits>_dollars" with "USD<digits>". (Ex. "123_dollars" goes to "USD123".)
|
710
|
+
filters:
|
711
|
+
...
|
712
|
+
- type: rename
|
713
|
+
rules:
|
714
|
+
- rule: regex_replace
|
715
|
+
match: "([0-9]+)_dollars"
|
716
|
+
replace: "USD$1"
|
717
|
+
|
718
|
+
|
719
|
+
Renaming rule: truncate
|
720
|
+
^^^^^^^^^^^^^^^^^^^^^^^^
|
721
|
+
|
722
|
+
The rule ``truncate`` truncates column names.
|
723
|
+
|
724
|
+
+------------+-----------------------------------------------------+--------------------+
|
725
|
+
| option | description | required? |
|
726
|
+
+============+=====================================================+====================+
|
727
|
+
| max_length | The length to which the column names are truncated. | ``128`` by default |
|
728
|
+
+------------+-----------------------------------------------------+--------------------+
|
729
|
+
|
730
|
+
Example
|
731
|
+
""""""""
|
732
|
+
|
733
|
+
.. code-block:: yaml
|
734
|
+
|
735
|
+
# This configuration drops all characters after the 20th character.
|
736
|
+
filters:
|
737
|
+
...
|
738
|
+
- type: rename
|
739
|
+
rules:
|
740
|
+
- rule: truncate
|
741
|
+
max_length: 20
|
742
|
+
|
743
|
+
Renaming rule: upper\_to\_lower
|
744
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
745
|
+
|
746
|
+
The rule ``upper_to_lower`` converts upper-case alphabets to lower-case.
|
747
|
+
|
748
|
+
Example
|
749
|
+
""""""""
|
750
|
+
|
751
|
+
.. code-block:: yaml
|
752
|
+
|
753
|
+
# This configuration converts all upper-case alphabets to lower-case.
|
754
|
+
filters:
|
755
|
+
...
|
756
|
+
- type: rename
|
757
|
+
rules:
|
758
|
+
- rule: upper_to_lower
|
759
|
+
|
760
|
+
Renaming rule: unique\_number\_suffix
|
761
|
+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
762
|
+
|
763
|
+
The rule ``unique_number_suffix`` makes column names unique in the schema by suffixing numbers.
|
764
|
+
|
765
|
+
+------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
|
766
|
+
| option | description | required? |
|
767
|
+
+============+=============================================================================================================================+====================+
|
768
|
+
| delimiter | A delimiter character inserted before a suffix number. It must be just 1 non-digit character. | ``"_"`` by default |
|
769
|
+
+------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
|
770
|
+
| digits | An integer that specifies the number of zero-filled digits of a suffix number. The suffix number is zero-filled to the digits. | optional |
|
771
|
+
+------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
|
772
|
+
| max_length | The length to which the column names are truncated. The column name is truncated before the suffix number. | optional |
|
773
|
+
+------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
|
774
|
+
| offset | An integer where the suffix number starts. The first duplicative column name is suffixed by (``offset`` + 1). | ``1`` by default |
|
775
|
+
+------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
|
776
|
+
|
777
|
+
.. hint::
|
778
|
+
The procedure to make column names unique is not very trivial. There are many feasible ways. This renaming rule works as follows:
|
779
|
+
|
780
|
+
Basic policies:
|
781
|
+
|
782
|
+
* Suffix numbers are counted per original column name.
|
783
|
+
* Column names are fixed from the first column to the last column.
|
784
|
+
|
785
|
+
Actual procedure applied from the first (leftmost) column to the last (rightmost) column:
|
786
|
+
|
787
|
+
1. Fix the column name as-is with truncating if the truncated name is not duplicated with left columns.
|
788
|
+
2. Suffix the column name otherwise.
|
789
|
+
|
790
|
+
a. Try to append the suffix number for the original column name with truncating.
|
791
|
+
b. Fix it if the suffixed name is not duplicated with left columns nor original columns.
|
792
|
+
c. Retry (a) with the suffix number increased otherwise.
|
588
793
|
|
589
794
|
Example
|
590
|
-
|
795
|
+
""""""""
|
796
|
+
|
797
|
+
.. code-block:: yaml
|
798
|
+
|
799
|
+
# This configuration suffixes numbers to duplicative column names. (Ex. ["column", "column", "column"] goes to ["column", "column_2", "column_3"].)
|
800
|
+
filters:
|
801
|
+
...
|
802
|
+
- type: rename
|
803
|
+
rules:
|
804
|
+
- rule: unique_number_suffix
|
805
|
+
|
806
|
+
Example of renaming rules
|
807
|
+
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
808
|
+
|
809
|
+
.. code-block:: yaml
|
810
|
+
|
811
|
+
filters:
|
812
|
+
...
|
813
|
+
- type: rename
|
814
|
+
rules:
|
815
|
+
- rule: upper_to_lower # All upper-case are converted to lower-case.
|
816
|
+
- rule: character_types # Only lower-case, digits and "_" are allowed. (No upper-case by the rule above.)
|
817
|
+
pass_types: [ "a-z", "0-9" ]
|
818
|
+
pass_characters: "_"
|
819
|
+
- rule: unique_number_suffix # Ensure all column names are unique.
|
820
|
+
|
821
|
+
Columns: not recommended
|
822
|
+
~~~~~~~~~~~~~~~~~~~~~~~~~
|
823
|
+
|
824
|
+
``columns`` is not recommended to use anymore. Consider using ``rules`` instead.
|
591
825
|
|
592
826
|
.. code-block:: yaml
|
593
827
|
|
@@ -598,13 +832,16 @@ Example
|
|
598
832
|
my_existing_column1: new_column1
|
599
833
|
my_existing_column2: new_column2
|
600
834
|
|
835
|
+
.. hint::
|
836
|
+
``columns`` are applied before ``rules`` if ``columns`` and ``rules`` are specified together. (It is discouraged to specify them together, though.)
|
837
|
+
|
601
838
|
Local executor plugin
|
602
|
-
|
839
|
+
----------------------
|
603
840
|
|
604
841
|
The ``local`` executor plugin runs tasks using local threads. This is the only built-in executor plugin.
|
605
842
|
|
606
843
|
Options
|
607
|
-
|
844
|
+
~~~~~~~~
|
608
845
|
|
609
846
|
+------------------+----------+----------------------------------------------------------------------+--------------------------------------+
|
610
847
|
| name | type | description | required? |
|
@@ -620,7 +857,7 @@ The ``max_threads`` option controls maximum concurrency. Setting smaller number
|
|
620
857
|
The ``min_output_tasks`` option enables "page scattering". The feature is enabled if the number of input tasks is less than ``min_output_tasks``. It uses multiple filter & output threads for each input task so that one input task can use multiple threads. Setting a larger number here is useful if embulk doesn't use multi-threading with enough concurrency because there are too few input tasks. Setting 1 here disables page scattering completely.
|
621
858
|
|
622
859
|
Example
|
623
|
-
|
860
|
+
~~~~~~~~
|
624
861
|
|
625
862
|
.. code-block:: yaml
|
626
863
|
|
@@ -635,12 +872,12 @@ Example
|
|
635
872
|
...
|
636
873
|
|
637
874
|
Guess executor
|
638
|
-
|
875
|
+
---------------
|
639
876
|
|
640
877
|
The guess executor is called by the ``guess`` command. It executes the default guess plugins in sequential order and suggests an Embulk config using the appropriate guess plugin. The default guess plugins, in order, are ``gzip``, ``bzip2``, ``json`` and ``csv``.
|
641
878
|
|
642
879
|
Options
|
643
|
-
|
880
|
+
~~~~~~~~
|
644
881
|
|
645
882
|
+-----------------------+----------+----------------------------------------------------------------------+--------------------------------------+
|
646
883
|
| name | type | description | required? |
|
@@ -657,7 +894,7 @@ The ``exclude_guess_plugins`` option exclude specified guess plugins from the li
|
|
657
894
|
This example shows how to use ``csv_all_strings`` guess plugin, which suggests column types within CSV files as string types. It needs to be explicitly specified by users when it's used instead of ``csv`` guess plugin because the plugin is not included in default guess plugins. We also can exclude default ``csv`` guess plugin.
|
658
895
|
|
659
896
|
Example
|
660
|
-
|
897
|
+
~~~~~~~~
|
661
898
|
|
662
899
|
.. code-block:: yaml
|
663
900
|
|