embulk 0.8.14-java → 0.8.15-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 353b653ad68c876426f467ef64c82698598e9a59
4
- data.tar.gz: 78a19ab05e47c9334267c02e4b4ea7ce2a0a3467
3
+ metadata.gz: c47e67c0b46d1a8dcd84337dee0d3479fd37f45f
4
+ data.tar.gz: edcd6bafc9095768db0798dcae9c0d53532f5054
5
5
  SHA512:
6
- metadata.gz: 527b918b1809e5e04cbe5eaf51efd589d753589aa1569168dd8801cc89a680388aa2ef13c4b3041a99ced2637a9ff2751db3248eca05ae34a5ec3e6228af46b7
7
- data.tar.gz: cf8ffc06d1e43b2b85c68d9f815b99265c2b48e65460ff97b05a8f6e73297d197390f5e7a850a01ff5482415d725a31bcca3ffd769c38ffac84492c7b1afd60a
6
+ metadata.gz: 1946063a900859720ba33919583f1bcfabb543af948b02260f3f91664445c2754aa2014e5b9c3c41cddd800ecb238b2450c39a0520d825d4f03d9d693f7d04f3
7
+ data.tar.gz: 8348fd5001511a3b718668eccb8b2686fd6cc2cf9635b38f2c941f79b20aa185c83bc01bb535559782895ead8a52b5cdb823c341a177afed7af1dff568e57001
data/README.md CHANGED
@@ -204,7 +204,7 @@ Run following commands and follow its instruction:
204
204
 
205
205
  ```
206
206
  ./gradlew releaseCheck
207
- ./gradlew release
207
+ ./gradlew clean cli gem && ./gradlew release
208
208
  git commit -am v$VERSION
209
209
  git tag v$VERSION
210
210
  ```
@@ -11,12 +11,12 @@ apply plugin: "com.github.jruby-gradle.jar"
11
11
  apply plugin: 'com.jfrog.bintray'
12
12
  apply plugin: 'com.github.johnrengelman.shadow'
13
13
 
14
- def java_projects = [project(":embulk-core"), project(":embulk-standards"), project(":embulk-cli")]
15
- def release_projects = [project(":embulk-core"), project(":embulk-standards")]
14
+ def java_projects = [project(":embulk-core"), project(":embulk-standards"), project(":embulk-cli"), project(":embulk-test")]
15
+ def release_projects = [project(":embulk-core"), project(":embulk-standards"), project(":embulk-test")]
16
16
 
17
17
  allprojects {
18
18
  group = 'org.embulk'
19
- version = '0.8.14'
19
+ version = '0.8.15'
20
20
 
21
21
  ext {
22
22
  jrubyVersion = '9.1.5.0'
@@ -126,7 +126,7 @@ public class EmbulkEmbed
126
126
  {
127
127
  this.injector = injector;
128
128
  injector.getInstance(org.slf4j.ILoggerFactory.class);
129
- this.bulkLoader = new BulkLoader(injector, systemConfig);
129
+ this.bulkLoader = injector.getInstance(BulkLoader.class);
130
130
  this.guessExecutor = injector.getInstance(GuessExecutor.class);
131
131
  this.previewExecutor = injector.getInstance(PreviewExecutor.class);
132
132
  }
@@ -62,7 +62,7 @@ public class BulkLoader
62
62
  this.injector = injector;
63
63
  }
64
64
 
65
- private static class LoaderState
65
+ protected static class LoaderState
66
66
  implements ProcessState
67
67
  {
68
68
  private final Logger logger;
@@ -343,7 +343,17 @@ public class BulkLoader
343
343
  ignoredExceptions.add(ex);
344
344
  }
345
345
 
346
- return new ExecutionResult(configDiff, ignoredExceptions.build());
346
+ return new ExecutionResult(configDiff, false, ignoredExceptions.build());
347
+ }
348
+
349
+ public ExecutionResult buildExecuteResultOfSkippedExecution(ConfigDiff configDiff)
350
+ {
351
+ ImmutableList.Builder<Throwable> ignoredExceptions = ImmutableList.builder();
352
+ for (Throwable e : getExceptions()) {
353
+ ignoredExceptions.add(e);
354
+ }
355
+
356
+ return new ExecutionResult(configDiff, true, ignoredExceptions.build());
347
357
  }
348
358
 
349
359
  public ResumeState buildResumeState(ExecSession exec)
@@ -364,6 +374,11 @@ public class BulkLoader
364
374
  }
365
375
  }
366
376
 
377
+ protected LoaderState newLoaderState(Logger logger, ProcessPluginSet plugins)
378
+ {
379
+ return new LoaderState(logger, plugins);
380
+ }
381
+
367
382
  public ExecutionResult run(ExecSession exec, final ConfigSource config)
368
383
  {
369
384
  try {
@@ -418,7 +433,7 @@ public class BulkLoader
418
433
  }
419
434
  }
420
435
 
421
- private static class ProcessPluginSet
436
+ protected static class ProcessPluginSet
422
437
  {
423
438
  private final PluginType inputPluginType;
424
439
  private final PluginType outputPluginType;
@@ -507,7 +522,7 @@ public class BulkLoader
507
522
  final ExecutorPlugin exec = newExecutorPlugin(task);
508
523
  final ProcessPluginSet plugins = new ProcessPluginSet(task);
509
524
 
510
- final LoaderState state = new LoaderState(Exec.getLogger(BulkLoader.class), plugins);
525
+ final LoaderState state = newLoaderState(Exec.getLogger(BulkLoader.class), plugins);
511
526
  state.setTransactionStage(TransactionStage.INPUT_BEGIN);
512
527
  try {
513
528
  ConfigDiff inputConfigDiff = plugins.getInputPlugin().transaction(task.getInputConfig(), new InputPlugin.Control() {
@@ -565,7 +580,11 @@ public class BulkLoader
565
580
  return state.buildExecuteResult();
566
581
 
567
582
  } catch (Throwable ex) {
568
- if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
583
+ if (isSkippedTransaction(ex)) {
584
+ ConfigDiff configDiff = ((SkipTransactionException) ex).getConfigDiff();
585
+ return state.buildExecuteResultOfSkippedExecution(configDiff);
586
+ }
587
+ else if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
569
588
  // ignore the exception
570
589
  return state.buildExecuteResultWithWarningException(ex);
571
590
  }
@@ -580,7 +599,7 @@ public class BulkLoader
580
599
  final ExecutorPlugin exec = newExecutorPlugin(task);
581
600
  final ProcessPluginSet plugins = new ProcessPluginSet(task);
582
601
 
583
- final LoaderState state = new LoaderState(Exec.getLogger(BulkLoader.class), plugins);
602
+ final LoaderState state = newLoaderState(Exec.getLogger(BulkLoader.class), plugins);
584
603
  state.setTransactionStage(TransactionStage.INPUT_BEGIN);
585
604
  try {
586
605
  ConfigDiff inputConfigDiff = plugins.getInputPlugin().resume(resume.getInputTaskSource(), resume.getInputSchema(), resume.getInputTaskReports().size(), new InputPlugin.Control() {
@@ -642,7 +661,11 @@ public class BulkLoader
642
661
  return state.buildExecuteResult();
643
662
 
644
663
  } catch (Throwable ex) {
645
- if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
664
+ if (isSkippedTransaction(ex)) {
665
+ ConfigDiff configDiff = ((SkipTransactionException) ex).getConfigDiff();
666
+ return state.buildExecuteResultOfSkippedExecution(configDiff);
667
+ }
668
+ else if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
646
669
  // ignore the exception
647
670
  return state.buildExecuteResultWithWarningException(ex);
648
671
  }
@@ -650,6 +673,11 @@ public class BulkLoader
650
673
  }
651
674
  }
652
675
 
676
+ private static boolean isSkippedTransaction(Throwable ex)
677
+ {
678
+ return ex instanceof SkipTransactionException;
679
+ }
680
+
653
681
  private static void restoreResumedTaskReports(ResumeState resume, LoaderState state)
654
682
  {
655
683
  int inputTaskCount = resume.getInputTaskReports().size();
@@ -26,6 +26,8 @@ public class ExecModule
26
26
  {
27
27
  Preconditions.checkNotNull(binder, "binder is null.");
28
28
 
29
+ binder.bind(BulkLoader.class);
30
+
29
31
  binder.bind(ILoggerFactory.class).toProvider(LoggerProvider.class).in(Scopes.SINGLETON);
30
32
  binder.bind(ModelManager.class).in(Scopes.SINGLETON);
31
33
  binder.bind(BufferAllocator.class).to(PooledBufferAllocator.class).in(Scopes.SINGLETON);
@@ -6,11 +6,13 @@ import org.embulk.config.ConfigDiff;
6
6
  public class ExecutionResult
7
7
  {
8
8
  private final ConfigDiff configDiff;
9
+ private final boolean skipped;
9
10
  private final List<Throwable> ignoredExceptions;
10
11
 
11
- public ExecutionResult(ConfigDiff configDiff, List<Throwable> ignoredExceptions)
12
+ public ExecutionResult(ConfigDiff configDiff, boolean skipped, List<Throwable> ignoredExceptions)
12
13
  {
13
14
  this.configDiff = configDiff;
15
+ this.skipped = skipped;
14
16
  this.ignoredExceptions = ignoredExceptions;
15
17
  }
16
18
 
@@ -19,6 +21,11 @@ public class ExecutionResult
19
21
  return configDiff;
20
22
  }
21
23
 
24
+ public boolean isSkipped()
25
+ {
26
+ return skipped;
27
+ }
28
+
22
29
  public List<Throwable> getIgnoredExceptions()
23
30
  {
24
31
  return ignoredExceptions;
@@ -0,0 +1,23 @@
1
+ package org.embulk.exec;
2
+
3
+ import org.embulk.config.ConfigDiff;
4
+
5
+ // Input/output plugins might need to stop Embulk before the transaction starts by depending
6
+ // on the conditions of input/output data sources/destinations. They can throw this exception
7
+ // if they want to do that. Embulk handles it and then stops the transaction.
8
+ public class SkipTransactionException
9
+ extends RuntimeException
10
+ {
11
+ private final ConfigDiff configDiff;
12
+
13
+ public SkipTransactionException(ConfigDiff configDiff)
14
+ {
15
+ super();
16
+ this.configDiff = configDiff;
17
+ }
18
+
19
+ public ConfigDiff getConfigDiff()
20
+ {
21
+ return configDiff;
22
+ }
23
+ }
@@ -1,12 +1,12 @@
1
1
  Configuration
2
- ==================================
2
+ ==============
3
3
 
4
4
  .. contents::
5
5
  :local:
6
6
  :depth: 2
7
7
 
8
8
  Embulk configuration file format
9
- ------------------
9
+ ---------------------------------
10
10
 
11
11
  Embulk uses a YAML file to define a bulk data loading. Here is an example of the file:
12
12
 
@@ -60,7 +60,7 @@ In many cases, what you need to write is **in:**, **out**: and **formatter** sec
60
60
 
61
61
 
62
62
  Using variables
63
- ~~~~~~~~~~~~~~~~~~
63
+ ~~~~~~~~~~~~~~~~
64
64
 
65
65
  You can embed environment variables in configuration file using `Liquid template engine <http://liquidmarkup.org/>`_ (This is an experimental feature. Behavior might change or be removed in future releases).
66
66
 
@@ -89,7 +89,7 @@ Environment variables are set to ``env`` variable.
89
89
 
90
90
 
91
91
  Including files
92
- ~~~~~~~~~~~~~~~~~~
92
+ ~~~~~~~~~~~~~~~~
93
93
 
94
94
  Configuration file can include another configuration file. To use it, configuration file name must end with ``.yml.liquid``.
95
95
 
@@ -121,12 +121,12 @@ With above 2 files, actual configuration file will be:
121
121
 
122
122
 
123
123
  Local file input plugin
124
- ------------------
124
+ ------------------------
125
125
 
126
126
  The ``file`` input plugin reads files from local file system.
127
127
 
128
128
  Options
129
- ~~~~~~~~~~~~~~~~~~
129
+ ~~~~~~~~
130
130
 
131
131
  +----------------+----------+------------------------------------------------+-----------+
132
132
  | name | type | description | required? |
@@ -168,7 +168,7 @@ For example, if you set ``last_path: /path/to/files/sample_02.csv``, Embulk read
168
168
  |-- sample_04.csv -> read
169
169
 
170
170
  Example
171
- ~~~~~~~~~~~~~~~~~~
171
+ ~~~~~~~~
172
172
 
173
173
  .. code-block:: yaml
174
174
 
@@ -187,7 +187,7 @@ CSV parser plugin
187
187
  The ``csv`` parser plugin parses CSV and TSV files.
188
188
 
189
189
  Options
190
- ~~~~~~~~~~~~~~~~~~
190
+ ~~~~~~~~
191
191
 
192
192
  +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
193
193
  | name | type | description | required? |
@@ -271,7 +271,7 @@ The ``null_string`` option converts certain values to NULL. Values will be conve
271
271
  You can use ``guess`` to automatically generate the column settings. See also `Quick Start <https://github.com/embulk/embulk#quick-start>`_.
272
272
 
273
273
  Example
274
- ~~~~~~~~~~~~~~~~~~
274
+ ~~~~~~~~
275
275
 
276
276
  .. code-block:: yaml
277
277
 
@@ -295,8 +295,30 @@ Example
295
295
  - {name: comment, type: string}
296
296
 
297
297
 
298
+ .. note::
299
+
300
+ CSV parser supports ``format: '%s'`` to parse UNIX timestamp in seconds (e.g. 1470148959) as timestamp.
301
+
302
+ However, CSV parser itself can't parse UNIX timestamp in milliseconds (e.g. 1470148959542) as timestamp. You can still parse the column as ``long`` type first, then apply `timestamp_format <https://github.com/sonots/embulk-filter-timestamp_format>`_ filter plugin to convert long to timestamp. Here is an example:
303
+
304
+ .. code-block:: yaml
305
+
306
+ in:
307
+ type: file
308
+ path_prefix: /my_csv_files
309
+ parser:
310
+ ...
311
+ columns:
312
+ - {name: timestamp_in_seconds, type: timestamp, format: '%s'}
313
+ - {name: timestamp_in_millis, type: long}
314
+ filters:
315
+ - type: timestamp_format
316
+ columns:
317
+ - {name: timestamp_in_millis, from_unit: ms}
318
+
319
+
298
320
  JSON parser plugin
299
- ------------------
321
+ -------------------
300
322
 
301
323
  The ``json`` parser plugin parses a JSON file that contains a sequence of JSON objects. Example:
302
324
 
@@ -310,7 +332,7 @@ The ``json`` parser plugin parses a JSON file that contains a sequence of JSON o
310
332
  ``json`` parser plugin outputs a single record named "record" (type is json).
311
333
 
312
334
  Options
313
- ~~~~~~~~~~~~~~~~~~
335
+ ~~~~~~~~
314
336
 
315
337
  +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
316
338
  | name | type | description | required? |
@@ -320,7 +342,7 @@ Options
320
342
 
321
343
 
322
344
  Example
323
- ~~~~~~~~~~~~~~~~~~
345
+ ~~~~~~~~
324
346
 
325
347
  .. code-block:: yaml
326
348
 
@@ -329,17 +351,17 @@ Example
329
351
  type: json
330
352
 
331
353
  Gzip decoder plugin
332
- ------------------
354
+ --------------------
333
355
 
334
356
  The ``gzip`` decoder plugin decompresses gzip files before input plugins read them.
335
357
 
336
358
  Options
337
- ~~~~~~~~~~~~~~~~~~
359
+ ~~~~~~~~
338
360
 
339
361
  This plugin doesn't have any options.
340
362
 
341
363
  Example
342
- ~~~~~~~~~~~~~~~~~~
364
+ ~~~~~~~~
343
365
 
344
366
  .. code-block:: yaml
345
367
 
@@ -350,17 +372,17 @@ Example
350
372
 
351
373
 
352
374
  BZip2 decoder plugin
353
- ------------------
375
+ ---------------------
354
376
 
355
377
  The ``bzip2`` decoder plugin decompresses bzip2 files before input plugins read them.
356
378
 
357
379
  Options
358
- ~~~~~~~~~~~~~~~~~~
380
+ ~~~~~~~~
359
381
 
360
382
  This plugin doesn't have any options.
361
383
 
362
384
  Example
363
- ~~~~~~~~~~~~~~~~~~
385
+ ~~~~~~~~
364
386
 
365
387
  .. code-block:: yaml
366
388
 
@@ -371,12 +393,12 @@ Example
371
393
 
372
394
 
373
395
  File output plugin
374
- ------------------
396
+ -------------------
375
397
 
376
398
  The ``file`` output plugin writes records to local file system.
377
399
 
378
400
  Options
379
- ~~~~~~~~~~~~~~~~~~
401
+ ~~~~~~~~
380
402
 
381
403
  +--------------------+----------+---------------------------------------------------+----------------------------+
382
404
  | name | type | description | required? |
@@ -404,7 +426,7 @@ For example, if you set ``path_prefix: /path/to/output/sample_``, ``sequence_for
404
426
  ``sequence_format`` formats task index and sequence number in a task.
405
427
 
406
428
  Example
407
- ~~~~~~~~~~~~~~~~~~
429
+ ~~~~~~~~
408
430
 
409
431
  .. code-block:: yaml
410
432
 
@@ -416,12 +438,12 @@ Example
416
438
  ...
417
439
 
418
440
  CSV formatter plugin
419
- ------------------
441
+ ---------------------
420
442
 
421
443
  The ``csv`` formatter plugin formats records using CSV or TSV format.
422
444
 
423
445
  Options
424
- ~~~~~~~~~~~~~~~~~~
446
+ ~~~~~~~~
425
447
 
426
448
  +----------------------+---------+-------------------------------------------------------------------------------------------------------+-------------------------------+
427
449
  | name | type | description | required? |
@@ -474,7 +496,7 @@ The ``column_options`` option is a map whose keys are name of columns, and value
474
496
  +----------------------+---------+-------------------------------------------------------------------------------------------------------+-----------------------------------------+
475
497
 
476
498
  Example
477
- ~~~~~~~~~~~~~~~~~~
499
+ ~~~~~~~~
478
500
 
479
501
  .. code-block:: yaml
480
502
 
@@ -496,12 +518,12 @@ Example
496
518
  mycol2: {format: '%Y-%m-%d %H:%M:%S', timezone: 'America/Los_Angeles'}
497
519
 
498
520
  Gzip encoder plugin
499
- ------------------
521
+ --------------------
500
522
 
501
523
  The ``gzip`` encoder plugin compresses output files using gzip.
502
524
 
503
525
  Options
504
- ~~~~~~~~~~~~~~~~~~
526
+ ~~~~~~~~
505
527
 
506
528
  +---------+----------+----------------------------------------------------------------------+--------------------+
507
529
  | name | type | description | required? |
@@ -510,7 +532,7 @@ Options
510
532
  +---------+----------+----------------------------------------------------------------------+--------------------+
511
533
 
512
534
  Example
513
- ~~~~~~~~~~~~~~~~~~
535
+ ~~~~~~~~
514
536
 
515
537
  .. code-block:: yaml
516
538
 
@@ -521,73 +543,285 @@ Example
521
543
  level: 1
522
544
 
523
545
 
524
- Gzip encoder plugin
525
- ------------------
546
+ BZip2 encoder plugin
547
+ ---------------------
526
548
 
527
- The ``gzip`` encoder plugin compresses output files using gzip.
549
+ The ``bzip2`` encoder plugin compresses output files using bzip2.
528
550
 
529
551
  Options
530
- ~~~~~~~~~~~~~~~~~~
552
+ ~~~~~~~~
531
553
 
532
554
  +---------+----------+----------------------------------------------------------------------+--------------------+
533
555
  | name | type | description | required? |
534
556
  +=========+==========+======================================================================+====================+
535
- | level | integer | Compression level. From 0 (no compression) to 9 (best compression). | ``6`` by default |
557
+ | level | integer | Compression level. From 1 to 9 (best compression). | ``9`` by default |
536
558
  +---------+----------+----------------------------------------------------------------------+--------------------+
537
559
 
538
560
  Example
539
- ~~~~~~~~~~~~~~~~~~
561
+ ~~~~~~~~
540
562
 
541
563
  .. code-block:: yaml
542
564
 
543
565
  out:
544
566
  ...
545
567
  encoders:
546
- - type: gzip
547
- level: 1
568
+ - type: bzip2
569
+ level: 6
548
570
 
549
- BZip2 encoder plugin
550
- ------------------
551
571
 
552
- The ``bzip2`` encoder plugin compresses output files using bzip2.
572
+ Rename filter plugin
573
+ ---------------------
574
+
575
+ The ``rename`` filter plugin changes column names. This plugin has no impact on performance.
553
576
 
554
577
  Options
555
- ~~~~~~~~~~~~~~~~~~
578
+ ~~~~~~~~
556
579
 
557
580
  +---------+----------+----------------------------------------------------------------------+--------------------+
558
581
  | name | type | description | required? |
559
582
  +=========+==========+======================================================================+====================+
560
- | level | integer | Compression level. From 1 to 9 (best compression). | ``9`` by default |
583
+ | rules | array | An array of rule-based renaming operations. (See below for rules.) | ``[]`` by default |
584
+ +---------+----------+----------------------------------------------------------------------+--------------------+
585
+ | columns | hash | A map whose keys are existing column names. values are new names. | ``{}`` by default |
561
586
  +---------+----------+----------------------------------------------------------------------+--------------------+
562
587
 
588
+ Renaming rules
589
+ ~~~~~~~~~~~~~~~
590
+
591
+ The ``rules`` is an array of rules as below applied top-down for all the columns.
592
+
593
+ +-------------------------+----------------------------------------------------------------------------------------+
594
+ | rule | description |
595
+ +=========================+========================================================================================+
596
+ | character\_types | Restrict characters by types. Replace restricted characters. |
597
+ +-------------------------+----------------------------------------------------------------------------------------+
598
+ | first\_character\_types | Restrict the first character by types. Prefix or replace first restricted characters. |
599
+ +-------------------------+----------------------------------------------------------------------------------------+
600
+ | lower\_to\_upper | Convert lower-case alphabets to upper-case. |
601
+ +-------------------------+----------------------------------------------------------------------------------------+
602
+ | regex\_replace | Replace with a regular expression. |
603
+ +-------------------------+----------------------------------------------------------------------------------------+
604
+ | truncate | Truncate. |
605
+ +-------------------------+----------------------------------------------------------------------------------------+
606
+ | upper\_to\_lower | Convert upper-case alphabets to lower-case. |
607
+ +-------------------------+----------------------------------------------------------------------------------------+
608
+ | unique\_number\_suffix | Make column names unique in the schema. |
609
+ +-------------------------+----------------------------------------------------------------------------------------+
610
+
611
+ Renaming rule: character\_types
612
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
613
+
614
+ The rule ``character_types`` replaces restricted characters.
615
+
616
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
617
+ | option | description | required? |
618
+ +===================+============================================================================================================================================+====================+
619
+ | pass\_characters | Characters to be allowed. | ``""`` by default |
620
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
621
+ | pass\_types | Sets of characters to be allowed. The array must consist of "a-z" (lower-case alphabets), "A-Z" (upper-case alphabets), or "0-9" (digits). | ``[]`` by default |
622
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
623
+ | replace | A character that disallowed characters are replaced with. It must consist of just 1 character. | ``"_"`` by default |
624
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
625
+
563
626
  Example
564
- ~~~~~~~~~~~~~~~~~~
627
+ """"""""
565
628
 
566
629
  .. code-block:: yaml
567
630
 
568
- out:
631
+ # This configuration replaces characters into "_" except for "_", lower-case alphabets, and digits.
632
+ filters:
569
633
  ...
570
- encoders:
571
- - type: bzip2
572
- level: 6
634
+ - type: rename
635
+ rules:
636
+ - rule: character_types
637
+ pass_characters: "_"
638
+ pass_types: [ "a-z", "0-9" ]
573
639
 
574
640
 
575
- Rename filter plugin
576
- ------------------
641
+ Renaming rule: first\_character\_types
642
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
577
643
 
578
- The ``rename`` filter plugin changes column names. This plugin has no impact on performance.
644
+ The rule ``first_character_types`` prefixes or replaces a restricted character at the beginning.
579
645
 
580
- Options
581
- ~~~~~~~~~~~~~~~~~~
646
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
647
+ | option | description | required? |
648
+ +===================+============================================================================================================================================+==============================================+
649
+ | pass\_characters | Characters to be allowed. | ``""`` by default |
650
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
651
+ | pass\_types | Sets of characters to be allowed. The array must consist of "a-z" (lower-case alphabets), "A-Z" (upper-case alphabets), or "0-9" (digits). | ``[]`` by default |
652
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
653
+ | prefix | A character that a disallowed first character is prefixed with. | one of ``prefix`` or ``replace`` is required |
654
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
655
+ | replace | A character that a disallowed first character is replaced with. | one of ``prefix`` or ``replace`` is required |
656
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
582
657
 
583
- +---------+----------+----------------------------------------------------------------------+--------------------+
584
- | name | type | description | required? |
585
- +=========+==========+======================================================================+====================+
586
- | columns | hash | A map whose keys are existing column names. values are new names. | ``{}`` by default |
587
- +---------+----------+----------------------------------------------------------------------+--------------------+
658
+ Example
659
+ """"""""
660
+
661
+ .. code-block:: yaml
662
+
663
+ # This configuration prefixes a column name with "_" unless the name starts with "_" or a lower-case alphabet.
664
+ filters:
665
+ ...
666
+ - type: rename
667
+ rules:
668
+ - rule: first_character_types
669
+ pass_characters: "_"
670
+ pass_types: [ "a-z" ]
671
+ prefix: "_"
672
+
673
+ Renaming rule: lower\_to\_upper
674
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
675
+
676
+ The rule ``lower_to_upper`` converts lower-case alphabets to upper-case.
677
+
678
+ Example
679
+ """"""""
680
+
681
+ .. code-block:: yaml
682
+
683
+ # This configuration converts all lower-case alphabets to upper-case.
684
+ filters:
685
+ ...
686
+ - type: rename
687
+ rules:
688
+ - rule: lower_to_upper
689
+
690
+
691
+ Renaming rule: regex\_replace
692
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
693
+
694
+ The rule ``regex_replace`` replaces column names based on a regular expression.
695
+
696
+ +---------+--------------------------------------------------------------------------------------------------------------------------------------+-----------+
697
+ | option | description | required? |
698
+ +=========+======================================================================================================================================+===========+
699
+ | match | A `Java-style regular expression <https://docs.oracle.com/javase/tutorial/essential/regex/>`_ to which this string is to be matched. | required |
700
+ +---------+--------------------------------------------------------------------------------------------------------------------------------------+-----------+
701
+ | replace | A string to be substituted for each match in Java-style. | required |
702
+ +---------+--------------------------------------------------------------------------------------------------------------------------------------+-----------+
703
+
704
+ Example
705
+ """"""""
706
+
707
+ .. code-block:: yaml
708
+
709
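+ # This configuration replaces each match of "([0-9]+)_dollars" with "USD" followed by the captured digits. (Ex. "123_dollars" goes to "USD123".)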
710
+ filters:
711
+ ...
712
+ - type: rename
713
+ rules:
714
+ - rule: regex_replace
715
+ match: "([0-9]+)_dollars"
716
+ replace: "USD$1"
717
+
718
+
719
+ Renaming rule: truncate
720
+ ^^^^^^^^^^^^^^^^^^^^^^^^
721
+
722
+ The rule ``truncate`` truncates column names.
723
+
724
+ +------------+-----------------------------------------------------+--------------------+
725
+ | option | description | required? |
726
+ +============+=====================================================+====================+
727
+ | max_length | The length to which the column names are truncated. | ``128`` by default |
728
+ +------------+-----------------------------------------------------+--------------------+
729
+
730
+ Example
731
+ """"""""
732
+
733
+ .. code-block:: yaml
734
+
735
+ # This configuration drops all characters after the 20th character.
736
+ filters:
737
+ ...
738
+ - type: rename
739
+ rules:
740
+ - rule: truncate
741
+ max_length: 20
742
+
743
+ Renaming rule: upper\_to\_lower
744
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
745
+
746
+ The rule ``upper_to_lower`` converts upper-case alphabets to lower-case.
747
+
748
+ Example
749
+ """"""""
750
+
751
+ .. code-block:: yaml
752
+
753
+ # This configuration converts all upper-case alphabets to lower-case.
754
+ filters:
755
+ ...
756
+ - type: rename
757
+ rules:
758
+ - rule: upper_to_lower
759
+
760
+ Renaming rule: unique\_number\_suffix
761
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
762
+
763
+ The rule ``unique_number_suffix`` makes column names unique in the schema by suffixing numbers.
764
+
765
+ +------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
766
+ | option | description | required? |
767
+ +============+=============================================================================================================================+====================+
768
+ | delimiter | A delimiter character inserted before a suffix number. It must be just 1 non-digit character. | ``"_"`` by default |
769
+ +------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
770
+ | digits | An integer that specifies the number of zero-filled digits of a suffix number. The suffix number is zero-filled to the digits. | optional |
771
+ +------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
772
+ | max_length | The length to which the column names are truncated. The column name is truncated before the suffix number. | optional |
773
+ +------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
774
+ | offset | An integer where the suffix number starts. The first duplicative column name is suffixed by (``offset`` + 1). | ``1`` by default |
775
+ +------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
776
+
777
+ .. hint::
778
+ The procedure to make column names unique is not very trivial. There are many feasible ways. This renaming rule works as follows:
779
+
780
+ Basic policies:
781
+
782
+ * Suffix numbers are counted per original column name.
783
+ * Column names are fixed from the first column to the last column.
784
+
785
+ Actual procedure applied from the first (leftmost) column to the last (rightmost) column:
786
+
787
+ 1. Fix the column name as-is with truncating if the truncated name is not duplicated with left columns.
788
+ 2. Suffix the column name otherwise.
789
+
790
+ a. Try to append the suffix number for the original column name with truncating.
791
+ b. Fix it if the suffixed name is not duplicated with left columns nor original columns.
792
+ c. Retry (a) with the suffix number increased otherwise.
588
793
 
589
794
  Example
590
- ~~~~~~~~~~~~~~~~~~
795
+ """"""""
796
+
797
+ .. code-block:: yaml
798
+
799
+ # This configuration suffixes numbers to duplicative column names. (Ex. ["column", "column", "column"] goes to ["column", "column_2", "column_3"].)
800
+ filters:
801
+ ...
802
+ - type: rename
803
+ rules:
804
+ - rule: unique_number_suffix
805
+
806
+ Example of renaming rules
807
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~
808
+
809
+ .. code-block:: yaml
810
+
811
+ filters:
812
+ ...
813
+ - type: rename
814
+ rules:
815
+ - rule: upper_to_lower # All upper-case are converted to lower-case.
816
+ - rule: character_types # Only lower-case, digits and "_" are allowed. (No upper-case by the rule above.)
817
+ pass_types: [ "a-z", "0-9" ]
818
+ pass_characters: "_"
819
+ - rule: unique_number_suffix # Ensure all column names are unique.
820
+
821
+ Columns: not recommended
822
+ ~~~~~~~~~~~~~~~~~~~~~~~~~
823
+
824
+ ``columns`` is not recommended to use anymore. Consider using ``rules`` instead.
591
825
 
592
826
  .. code-block:: yaml
593
827
 
@@ -598,13 +832,16 @@ Example
598
832
  my_existing_column1: new_column1
599
833
  my_existing_column2: new_column2
600
834
 
835
+ .. hint::
836
+ ``columns`` are applied before ``rules`` if ``columns`` and ``rules`` are specified together. (It is discouraged to specify them together, though.)
837
+
601
838
  Local executor plugin
602
- ------------------
839
+ ----------------------
603
840
 
604
841
  The ``local`` executor plugin runs tasks using local threads. This is the only built-in executor plugin.
605
842
 
606
843
  Options
607
- ~~~~~~~~~~~~~~~~~~
844
+ ~~~~~~~~
608
845
 
609
846
  +------------------+----------+----------------------------------------------------------------------+--------------------------------------+
610
847
  | name | type | description | required? |
@@ -620,7 +857,7 @@ The ``max_threads`` option controls maximum concurrency. Setting smaller number
620
857
  The ``min_output_tasks`` option enables "page scattering". The feature is enabled if the number of input tasks is less than ``min_output_tasks``. It uses multiple filter & output threads for each input task so that one input task can use multiple threads. Setting a larger number here is useful if embulk doesn't use multi-threading with enough concurrency due to too few input tasks. Setting 1 here disables page scattering completely.
621
858
 
622
859
  Example
623
- ~~~~~~~~~~~~~~~~~~
860
+ ~~~~~~~~
624
861
 
625
862
  .. code-block:: yaml
626
863
 
@@ -635,12 +872,12 @@ Example
635
872
  ...
636
873
 
637
874
  Guess executor
638
- ------------------
875
+ ---------------
639
876
 
640
877
  The guess executor is called by ``guess`` command. It executes default guess plugins in a sequential order and suggests Embulk config by appropriate guess plugin. The default guess plugins and the order are ``gzip``, ``bzip2``, ``json`` and ``csv``.
641
878
 
642
879
  Options
643
- ~~~~~~~~~~~~~~~~~~
880
+ ~~~~~~~~
644
881
 
645
882
  +-----------------------+----------+----------------------------------------------------------------------+--------------------------------------+
646
883
  | name | type | description | required? |
@@ -657,7 +894,7 @@ The ``exclude_guess_plugins`` option exclude specified guess plugins from the li
657
894
  This example shows how to use ``csv_all_strings`` guess plugin, which suggests column types within CSV files as string types. It needs to be explicitly specified by users when it's used instead of ``csv`` guess plugin because the plugin is not included in default guess plugins. We also can exclude default ``csv`` guess plugin.
658
895
 
659
896
  Example
660
- ~~~~~~~~~~~~~~~~~~
897
+ ~~~~~~~~
661
898
 
662
899
  .. code-block:: yaml
663
900