embulk 0.8.14-java → 0.8.15-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 353b653ad68c876426f467ef64c82698598e9a59
4
- data.tar.gz: 78a19ab05e47c9334267c02e4b4ea7ce2a0a3467
3
+ metadata.gz: c47e67c0b46d1a8dcd84337dee0d3479fd37f45f
4
+ data.tar.gz: edcd6bafc9095768db0798dcae9c0d53532f5054
5
5
  SHA512:
6
- metadata.gz: 527b918b1809e5e04cbe5eaf51efd589d753589aa1569168dd8801cc89a680388aa2ef13c4b3041a99ced2637a9ff2751db3248eca05ae34a5ec3e6228af46b7
7
- data.tar.gz: cf8ffc06d1e43b2b85c68d9f815b99265c2b48e65460ff97b05a8f6e73297d197390f5e7a850a01ff5482415d725a31bcca3ffd769c38ffac84492c7b1afd60a
6
+ metadata.gz: 1946063a900859720ba33919583f1bcfabb543af948b02260f3f91664445c2754aa2014e5b9c3c41cddd800ecb238b2450c39a0520d825d4f03d9d693f7d04f3
7
+ data.tar.gz: 8348fd5001511a3b718668eccb8b2686fd6cc2cf9635b38f2c941f79b20aa185c83bc01bb535559782895ead8a52b5cdb823c341a177afed7af1dff568e57001
data/README.md CHANGED
@@ -204,7 +204,7 @@ Run following commands and follow its instruction:
204
204
 
205
205
  ```
206
206
  ./gradlew releaseCheck
207
- ./gradlew release
207
+ ./gradlew clean cli gem && ./gradlew release
208
208
  git commit -am v$VERSION
209
209
  git tag v$VERSION
210
210
  ```
@@ -11,12 +11,12 @@ apply plugin: "com.github.jruby-gradle.jar"
11
11
  apply plugin: 'com.jfrog.bintray'
12
12
  apply plugin: 'com.github.johnrengelman.shadow'
13
13
 
14
- def java_projects = [project(":embulk-core"), project(":embulk-standards"), project(":embulk-cli")]
15
- def release_projects = [project(":embulk-core"), project(":embulk-standards")]
14
+ def java_projects = [project(":embulk-core"), project(":embulk-standards"), project(":embulk-cli"), project(":embulk-test")]
15
+ def release_projects = [project(":embulk-core"), project(":embulk-standards"), project(":embulk-test")]
16
16
 
17
17
  allprojects {
18
18
  group = 'org.embulk'
19
- version = '0.8.14'
19
+ version = '0.8.15'
20
20
 
21
21
  ext {
22
22
  jrubyVersion = '9.1.5.0'
@@ -126,7 +126,7 @@ public class EmbulkEmbed
126
126
  {
127
127
  this.injector = injector;
128
128
  injector.getInstance(org.slf4j.ILoggerFactory.class);
129
- this.bulkLoader = new BulkLoader(injector, systemConfig);
129
+ this.bulkLoader = injector.getInstance(BulkLoader.class);
130
130
  this.guessExecutor = injector.getInstance(GuessExecutor.class);
131
131
  this.previewExecutor = injector.getInstance(PreviewExecutor.class);
132
132
  }
@@ -62,7 +62,7 @@ public class BulkLoader
62
62
  this.injector = injector;
63
63
  }
64
64
 
65
- private static class LoaderState
65
+ protected static class LoaderState
66
66
  implements ProcessState
67
67
  {
68
68
  private final Logger logger;
@@ -343,7 +343,17 @@ public class BulkLoader
343
343
  ignoredExceptions.add(ex);
344
344
  }
345
345
 
346
- return new ExecutionResult(configDiff, ignoredExceptions.build());
346
+ return new ExecutionResult(configDiff, false, ignoredExceptions.build());
347
+ }
348
+
349
+ public ExecutionResult buildExecuteResultOfSkippedExecution(ConfigDiff configDiff)
350
+ {
351
+ ImmutableList.Builder<Throwable> ignoredExceptions = ImmutableList.builder();
352
+ for (Throwable e : getExceptions()) {
353
+ ignoredExceptions.add(e);
354
+ }
355
+
356
+ return new ExecutionResult(configDiff, true, ignoredExceptions.build());
347
357
  }
348
358
 
349
359
  public ResumeState buildResumeState(ExecSession exec)
@@ -364,6 +374,11 @@ public class BulkLoader
364
374
  }
365
375
  }
366
376
 
377
+ protected LoaderState newLoaderState(Logger logger, ProcessPluginSet plugins)
378
+ {
379
+ return new LoaderState(logger, plugins);
380
+ }
381
+
367
382
  public ExecutionResult run(ExecSession exec, final ConfigSource config)
368
383
  {
369
384
  try {
@@ -418,7 +433,7 @@ public class BulkLoader
418
433
  }
419
434
  }
420
435
 
421
- private static class ProcessPluginSet
436
+ protected static class ProcessPluginSet
422
437
  {
423
438
  private final PluginType inputPluginType;
424
439
  private final PluginType outputPluginType;
@@ -507,7 +522,7 @@ public class BulkLoader
507
522
  final ExecutorPlugin exec = newExecutorPlugin(task);
508
523
  final ProcessPluginSet plugins = new ProcessPluginSet(task);
509
524
 
510
- final LoaderState state = new LoaderState(Exec.getLogger(BulkLoader.class), plugins);
525
+ final LoaderState state = newLoaderState(Exec.getLogger(BulkLoader.class), plugins);
511
526
  state.setTransactionStage(TransactionStage.INPUT_BEGIN);
512
527
  try {
513
528
  ConfigDiff inputConfigDiff = plugins.getInputPlugin().transaction(task.getInputConfig(), new InputPlugin.Control() {
@@ -565,7 +580,11 @@ public class BulkLoader
565
580
  return state.buildExecuteResult();
566
581
 
567
582
  } catch (Throwable ex) {
568
- if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
583
+ if (isSkippedTransaction(ex)) {
584
+ ConfigDiff configDiff = ((SkipTransactionException) ex).getConfigDiff();
585
+ return state.buildExecuteResultOfSkippedExecution(configDiff);
586
+ }
587
+ else if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
569
588
  // ignore the exception
570
589
  return state.buildExecuteResultWithWarningException(ex);
571
590
  }
@@ -580,7 +599,7 @@ public class BulkLoader
580
599
  final ExecutorPlugin exec = newExecutorPlugin(task);
581
600
  final ProcessPluginSet plugins = new ProcessPluginSet(task);
582
601
 
583
- final LoaderState state = new LoaderState(Exec.getLogger(BulkLoader.class), plugins);
602
+ final LoaderState state = newLoaderState(Exec.getLogger(BulkLoader.class), plugins);
584
603
  state.setTransactionStage(TransactionStage.INPUT_BEGIN);
585
604
  try {
586
605
  ConfigDiff inputConfigDiff = plugins.getInputPlugin().resume(resume.getInputTaskSource(), resume.getInputSchema(), resume.getInputTaskReports().size(), new InputPlugin.Control() {
@@ -642,7 +661,11 @@ public class BulkLoader
642
661
  return state.buildExecuteResult();
643
662
 
644
663
  } catch (Throwable ex) {
645
- if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
664
+ if (isSkippedTransaction(ex)) {
665
+ ConfigDiff configDiff = ((SkipTransactionException) ex).getConfigDiff();
666
+ return state.buildExecuteResultOfSkippedExecution(configDiff);
667
+ }
668
+ else if (state.isAllTasksCommitted() && state.isAllTransactionsCommitted()) {
646
669
  // ignore the exception
647
670
  return state.buildExecuteResultWithWarningException(ex);
648
671
  }
@@ -650,6 +673,11 @@ public class BulkLoader
650
673
  }
651
674
  }
652
675
 
676
+ private static boolean isSkippedTransaction(Throwable ex)
677
+ {
678
+ return ex instanceof SkipTransactionException;
679
+ }
680
+
653
681
  private static void restoreResumedTaskReports(ResumeState resume, LoaderState state)
654
682
  {
655
683
  int inputTaskCount = resume.getInputTaskReports().size();
@@ -26,6 +26,8 @@ public class ExecModule
26
26
  {
27
27
  Preconditions.checkNotNull(binder, "binder is null.");
28
28
 
29
+ binder.bind(BulkLoader.class);
30
+
29
31
  binder.bind(ILoggerFactory.class).toProvider(LoggerProvider.class).in(Scopes.SINGLETON);
30
32
  binder.bind(ModelManager.class).in(Scopes.SINGLETON);
31
33
  binder.bind(BufferAllocator.class).to(PooledBufferAllocator.class).in(Scopes.SINGLETON);
@@ -6,11 +6,13 @@ import org.embulk.config.ConfigDiff;
6
6
  public class ExecutionResult
7
7
  {
8
8
  private final ConfigDiff configDiff;
9
+ private final boolean skipped;
9
10
  private final List<Throwable> ignoredExceptions;
10
11
 
11
- public ExecutionResult(ConfigDiff configDiff, List<Throwable> ignoredExceptions)
12
+ public ExecutionResult(ConfigDiff configDiff, boolean skipped, List<Throwable> ignoredExceptions)
12
13
  {
13
14
  this.configDiff = configDiff;
15
+ this.skipped = skipped;
14
16
  this.ignoredExceptions = ignoredExceptions;
15
17
  }
16
18
 
@@ -19,6 +21,11 @@ public class ExecutionResult
19
21
  return configDiff;
20
22
  }
21
23
 
24
+ public boolean isSkipped()
25
+ {
26
+ return skipped;
27
+ }
28
+
22
29
  public List<Throwable> getIgnoredExceptions()
23
30
  {
24
31
  return ignoredExceptions;
@@ -0,0 +1,23 @@
1
+ package org.embulk.exec;
2
+
3
+ import org.embulk.config.ConfigDiff;
4
+
5
+ // Input/output plugins might need to stop Embulk before the transaction starts by depending
6
+ // on the conditions of input/output data sources/destinations. They can throw this exception
7
+ // if they want to do that. Embulk handles it and then stops the transaction.
8
+ public class SkipTransactionException
9
+ extends RuntimeException
10
+ {
11
+ private final ConfigDiff configDiff;
12
+
13
+ public SkipTransactionException(ConfigDiff configDiff)
14
+ {
15
+ super();
16
+ this.configDiff = configDiff;
17
+ }
18
+
19
+ public ConfigDiff getConfigDiff()
20
+ {
21
+ return configDiff;
22
+ }
23
+ }
@@ -1,12 +1,12 @@
1
1
  Configuration
2
- ==================================
2
+ ==============
3
3
 
4
4
  .. contents::
5
5
  :local:
6
6
  :depth: 2
7
7
 
8
8
  Embulk configuration file format
9
- ------------------
9
+ ---------------------------------
10
10
 
11
11
  Embulk uses a YAML file to define a bulk data loading. Here is an example of the file:
12
12
 
@@ -60,7 +60,7 @@ In many cases, what you need to write is **in:**, **out**: and **formatter** sec
60
60
 
61
61
 
62
62
  Using variables
63
- ~~~~~~~~~~~~~~~~~~
63
+ ~~~~~~~~~~~~~~~~
64
64
 
65
65
  You can embed environment variables in configuration file using `Liquid template engine <http://liquidmarkup.org/>`_ (This is experimental feature. Behavior might change or be removed in future releases).
66
66
 
@@ -89,7 +89,7 @@ Environment variables are set to ``env`` variable.
89
89
 
90
90
 
91
91
  Including files
92
- ~~~~~~~~~~~~~~~~~~
92
+ ~~~~~~~~~~~~~~~~
93
93
 
94
94
  Configuration file can include another configuration file. To use it, configuration file name must end with ``.yml.liquid``.
95
95
 
@@ -121,12 +121,12 @@ With above 2 files, actual configuration file will be:
121
121
 
122
122
 
123
123
  Local file input plugin
124
- ------------------
124
+ ------------------------
125
125
 
126
126
  The ``file`` input plugin reads files from local file system.
127
127
 
128
128
  Options
129
- ~~~~~~~~~~~~~~~~~~
129
+ ~~~~~~~~
130
130
 
131
131
  +----------------+----------+------------------------------------------------+-----------+
132
132
  | name | type | description | required? |
@@ -168,7 +168,7 @@ For example, if you set ``last_path: /path/to/files/sample_02.csv``, Embulk read
168
168
  |-- sample_04.csv -> read
169
169
 
170
170
  Example
171
- ~~~~~~~~~~~~~~~~~~
171
+ ~~~~~~~~
172
172
 
173
173
  .. code-block:: yaml
174
174
 
@@ -187,7 +187,7 @@ CSV parser plugin
187
187
  The ``csv`` parser plugin parses CSV and TSV files.
188
188
 
189
189
  Options
190
- ~~~~~~~~~~~~~~~~~~
190
+ ~~~~~~~~
191
191
 
192
192
  +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+---------------------------+
193
193
  | name | type | description | required? |
@@ -271,7 +271,7 @@ The ``null_string`` option converts certain values to NULL. Values will be conve
271
271
  You can use ``guess`` to automatically generate the column settings. See also `Quick Start <https://github.com/embulk/embulk#quick-start>`_.
272
272
 
273
273
  Example
274
- ~~~~~~~~~~~~~~~~~~
274
+ ~~~~~~~~
275
275
 
276
276
  .. code-block:: yaml
277
277
 
@@ -295,8 +295,30 @@ Example
295
295
  - {name: comment, type: string}
296
296
 
297
297
 
298
+ .. note::
299
+
300
+ CSV parser supports ``format: '%s'`` to parse UNIX timestamp in seconds (e.g. 1470148959) as timestamp.
301
+
302
+ However, CSV parser itself can't parse UNIX timestamp in milliseconds (e.g. 1470148959542) as timestamp. You can still parse the column as ``long`` type first, then apply `timestamp_format <https://github.com/sonots/embulk-filter-timestamp_format>`_ filter plugin to convert long to timestamp. Here is an example:
303
+
304
+ .. code-block:: yaml
305
+
306
+ in:
307
+ type: file
308
+ path_prefix: /my_csv_files
309
+ parser:
310
+ ...
311
+ columns:
312
+ - {name: timestamp_in_seconds, type: timestamp, format: '%s'}
313
+ - {name: timestamp_in_millis, type: long}
314
+ filters:
315
+ - type: timestamp_format
316
+ columns:
317
+ - {name: timestamp_in_millis, from_unit: ms}
318
+
319
+
298
320
  JSON parser plugin
299
- ------------------
321
+ -------------------
300
322
 
301
323
  The ``json`` parser plugin parses a JSON file that contains a sequence of JSON objects. Example:
302
324
 
@@ -310,7 +332,7 @@ The ``json`` parser plugin parses a JSON file that contains a sequence of JSON o
310
332
  ``json`` parser plugin outputs a single record named "record" (type is json).
311
333
 
312
334
  Options
313
- ~~~~~~~~~~~~~~~~~~
335
+ ~~~~~~~~
314
336
 
315
337
  +----------------------------+----------+----------------------------------------------------------------------------------------------------------------+------------------------+
316
338
  | name | type | description | required? |
@@ -320,7 +342,7 @@ Options
320
342
 
321
343
 
322
344
  Example
323
- ~~~~~~~~~~~~~~~~~~
345
+ ~~~~~~~~
324
346
 
325
347
  .. code-block:: yaml
326
348
 
@@ -329,17 +351,17 @@ Example
329
351
  type: json
330
352
 
331
353
  Gzip decoder plugin
332
- ------------------
354
+ --------------------
333
355
 
334
356
  The ``gzip`` decoder plugin decompresses gzip files before input plugins read them.
335
357
 
336
358
  Options
337
- ~~~~~~~~~~~~~~~~~~
359
+ ~~~~~~~~
338
360
 
339
361
  This plugin doesn't have any options.
340
362
 
341
363
  Example
342
- ~~~~~~~~~~~~~~~~~~
364
+ ~~~~~~~~
343
365
 
344
366
  .. code-block:: yaml
345
367
 
@@ -350,17 +372,17 @@ Example
350
372
 
351
373
 
352
374
  BZip2 decoder plugin
353
- ------------------
375
+ ---------------------
354
376
 
355
377
  The ``bzip2`` decoder plugin decompresses bzip2 files before input plugins read them.
356
378
 
357
379
  Options
358
- ~~~~~~~~~~~~~~~~~~
380
+ ~~~~~~~~
359
381
 
360
382
  This plugin doesn't have any options.
361
383
 
362
384
  Example
363
- ~~~~~~~~~~~~~~~~~~
385
+ ~~~~~~~~
364
386
 
365
387
  .. code-block:: yaml
366
388
 
@@ -371,12 +393,12 @@ Example
371
393
 
372
394
 
373
395
  File output plugin
374
- ------------------
396
+ -------------------
375
397
 
376
398
  The ``file`` output plugin writes records to local file system.
377
399
 
378
400
  Options
379
- ~~~~~~~~~~~~~~~~~~
401
+ ~~~~~~~~
380
402
 
381
403
  +--------------------+----------+---------------------------------------------------+----------------------------+
382
404
  | name | type | description | required? |
@@ -404,7 +426,7 @@ For example, if you set ``path_prefix: /path/to/output/sample_``, ``sequence_for
404
426
  ``sequence_format`` formats task index and sequence number in a task.
405
427
 
406
428
  Example
407
- ~~~~~~~~~~~~~~~~~~
429
+ ~~~~~~~~
408
430
 
409
431
  .. code-block:: yaml
410
432
 
@@ -416,12 +438,12 @@ Example
416
438
  ...
417
439
 
418
440
  CSV formatter plugin
419
- ------------------
441
+ ---------------------
420
442
 
421
443
  The ``csv`` formatter plugin formats records using CSV or TSV format.
422
444
 
423
445
  Options
424
- ~~~~~~~~~~~~~~~~~~
446
+ ~~~~~~~~
425
447
 
426
448
  +----------------------+---------+-------------------------------------------------------------------------------------------------------+-------------------------------+
427
449
  | name | type | description | required? |
@@ -474,7 +496,7 @@ The ``column_options`` option is a map whose keys are name of columns, and value
474
496
  +----------------------+---------+-------------------------------------------------------------------------------------------------------+-----------------------------------------+
475
497
 
476
498
  Example
477
- ~~~~~~~~~~~~~~~~~~
499
+ ~~~~~~~~
478
500
 
479
501
  .. code-block:: yaml
480
502
 
@@ -496,12 +518,12 @@ Example
496
518
  mycol2: {format: '%Y-%m-%d %H:%M:%S', timezone: 'America/Los_Angeles'}
497
519
 
498
520
  Gzip encoder plugin
499
- ------------------
521
+ --------------------
500
522
 
501
523
  The ``gzip`` encoder plugin compresses output files using gzip.
502
524
 
503
525
  Options
504
- ~~~~~~~~~~~~~~~~~~
526
+ ~~~~~~~~
505
527
 
506
528
  +---------+----------+----------------------------------------------------------------------+--------------------+
507
529
  | name | type | description | required? |
@@ -510,7 +532,7 @@ Options
510
532
  +---------+----------+----------------------------------------------------------------------+--------------------+
511
533
 
512
534
  Example
513
- ~~~~~~~~~~~~~~~~~~
535
+ ~~~~~~~~
514
536
 
515
537
  .. code-block:: yaml
516
538
 
@@ -521,73 +543,285 @@ Example
521
543
  level: 1
522
544
 
523
545
 
524
- Gzip encoder plugin
525
- ------------------
546
+ BZip2 encoder plugin
547
+ ---------------------
526
548
 
527
- The ``gzip`` encoder plugin compresses output files using gzip.
549
+ The ``bzip2`` encoder plugin compresses output files using bzip2.
528
550
 
529
551
  Options
530
- ~~~~~~~~~~~~~~~~~~
552
+ ~~~~~~~~
531
553
 
532
554
  +---------+----------+----------------------------------------------------------------------+--------------------+
533
555
  | name | type | description | required? |
534
556
  +=========+==========+======================================================================+====================+
535
- | level | integer | Compression level. From 0 (no compression) to 9 (best compression). | ``6`` by default |
557
+ | level | integer | Compression level. From 1 to 9 (best compression). | ``9`` by default |
536
558
  +---------+----------+----------------------------------------------------------------------+--------------------+
537
559
 
538
560
  Example
539
- ~~~~~~~~~~~~~~~~~~
561
+ ~~~~~~~~
540
562
 
541
563
  .. code-block:: yaml
542
564
 
543
565
  out:
544
566
  ...
545
567
  encoders:
546
- - type: gzip
547
- level: 1
568
+ - type: bzip2
569
+ level: 6
548
570
 
549
- BZip2 encoder plugin
550
- ------------------
551
571
 
552
- The ``bzip2`` encoder plugin compresses output files using bzip2.
572
+ Rename filter plugin
573
+ ---------------------
574
+
575
+ The ``rename`` filter plugin changes column names. This plugin has no impact on performance.
553
576
 
554
577
  Options
555
- ~~~~~~~~~~~~~~~~~~
578
+ ~~~~~~~~
556
579
 
557
580
  +---------+----------+----------------------------------------------------------------------+--------------------+
558
581
  | name | type | description | required? |
559
582
  +=========+==========+======================================================================+====================+
560
- | level | integer | Compression level. From 1 to 9 (best compression). | ``9`` by default |
583
+ | rules | array | An array of rule-based renaming operations. (See below for rules.) | ``[]`` by default |
584
+ +---------+----------+----------------------------------------------------------------------+--------------------+
585
+ | columns | hash | A map whose keys are existing column names. values are new names. | ``{}`` by default |
561
586
  +---------+----------+----------------------------------------------------------------------+--------------------+
562
587
 
588
+ Renaming rules
589
+ ~~~~~~~~~~~~~~~
590
+
591
+ The ``rules`` is an array of rules as below applied top-down for all the columns.
592
+
593
+ +-------------------------+----------------------------------------------------------------------------------------+
594
+ | rule | description |
595
+ +=========================+========================================================================================+
596
+ | character\_types | Restrict characters by types. Replace restricted characters. |
597
+ +-------------------------+----------------------------------------------------------------------------------------+
598
+ | first\_character\_types | Restrict the first character by types. Prefix or replace first restricted characters. |
599
+ +-------------------------+----------------------------------------------------------------------------------------+
600
+ | lower\_to\_upper | Convert lower-case alphabets to upper-case. |
601
+ +-------------------------+----------------------------------------------------------------------------------------+
602
+ | regex\_replace | Replace with a regular expression. |
603
+ +-------------------------+----------------------------------------------------------------------------------------+
604
+ | truncate | Truncate. |
605
+ +-------------------------+----------------------------------------------------------------------------------------+
606
+ | upper\_to\_lower | Convert upper-case alphabets to lower-case. |
607
+ +-------------------------+----------------------------------------------------------------------------------------+
608
+ | unique\_number\_suffix | Make column names unique in the schema. |
609
+ +-------------------------+----------------------------------------------------------------------------------------+
610
+
611
+ Renaming rule: character\_types
612
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
613
+
614
+ The rule ``character_types`` replaces restricted characters.
615
+
616
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
617
+ | option | description | required? |
618
+ +===================+============================================================================================================================================+====================+
619
+ | pass\_characters | Characters to be allowed. | ``""`` by default |
620
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
621
+ | pass\_types | Sets of characters to be allowed. The array must consist of "a-z" (lower-case alphabets), "A-Z" (upper-case alphabets), or "0-9" (digits). | ``[]`` by default |
622
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
623
+ | replace | A character that disallowed characters are replaced with. It must consist of just 1 character. | ``"_"`` by default |
624
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+--------------------+
625
+
563
626
  Example
564
- ~~~~~~~~~~~~~~~~~~
627
+ """"""""
565
628
 
566
629
  .. code-block:: yaml
567
630
 
568
- out:
631
+ # This configuration replaces characters into "_" except for "_", lower-case alphabets, and digits.
632
+ filters:
569
633
  ...
570
- encoders:
571
- - type: bzip2
572
- level: 6
634
+ - type: rename
635
+ rules:
636
+ - rule: character_types
637
+ pass_characters: "_"
638
+ pass_types: [ "a-z", "0-9" ]
573
639
 
574
640
 
575
- Rename filter plugin
576
- ------------------
641
+ Renaming rule: first\_character\_types
642
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
577
643
 
578
- The ``rename`` filter plugin changes column names. This plugin has no impact on performance.
644
+ The rule ``first_character_types`` prefixes or replaces a restricted character at the beginning.
579
645
 
580
- Options
581
- ~~~~~~~~~~~~~~~~~~
646
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
647
+ | option | description | required? |
648
+ +===================+============================================================================================================================================+==============================================+
649
+ | pass\_characters | Characters to be allowed. | ``""`` by default |
650
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
651
+ | pass\_types | Sets of characters to be allowed. The array must consist of "a-z" (lower-case alphabets), "A-Z" (upper-case alphabets), or "0-9" (digits). | ``[]`` by default |
652
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
653
+ | prefix | A character that a disallowed first character is prefixed with. | one of ``prefix`` or ``replace`` is required |
654
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
655
+ | replace | A character that a disallowed first character is replaced with. | one of ``prefix`` or ``replace`` is required |
656
+ +-------------------+--------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------+
582
657
 
583
- +---------+----------+----------------------------------------------------------------------+--------------------+
584
- | name | type | description | required? |
585
- +=========+==========+======================================================================+====================+
586
- | columns | hash | A map whose keys are existing column names. values are new names. | ``{}`` by default |
587
- +---------+----------+----------------------------------------------------------------------+--------------------+
658
+ Example
659
+ """"""""
660
+
661
+ .. code-block:: yaml
662
+
663
+ # This configuration prefixes a column name with "_" unless the name starts from "_" or a lower-case alphabet.
664
+ filters:
665
+ ...
666
+ - type: rename
667
+ rules:
668
+ - rule: first_character_types
669
+ pass_characters: "_"
670
+ pass_types: [ "a-z" ]
671
+ prefix: "_"
672
+
673
+ Renaming rule: lower\_to\_upper
674
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
675
+
676
+ The rule ``lower_to_upper`` converts lower-case alphabets to upper-case.
677
+
678
+ Example
679
+ """"""""
680
+
681
+ .. code-block:: yaml
682
+
683
+ # This configuration converts all lower-case alphabets to upper-case.
684
+ filters:
685
+ ...
686
+ - type: rename
687
+ rules:
688
+ - rule: lower_to_upper
689
+
690
+
691
+ Renaming rule: regex\_replace
692
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
693
+
694
+ The rule ``regex_replace`` replaces column names based on a regular expression.
695
+
696
+ +---------+--------------------------------------------------------------------------------------------------------------------------------------+-----------+
697
+ | option | description | required? |
698
+ +=========+======================================================================================================================================+===========+
699
+ | match | A `Java-style regular expression <https://docs.oracle.com/javase/tutorial/essential/regex/>`_ to which this string is to be matched. | required |
700
+ +---------+--------------------------------------------------------------------------------------------------------------------------------------+-----------+
701
+ | replace | A string to be substituted for each match in Java-style. | required |
702
+ +---------+--------------------------------------------------------------------------------------------------------------------------------------+-----------+
703
+
704
+ Example
705
+ """"""""
706
+
707
+ .. code-block:: yaml
708
+
709
+ # This configuration replaces all patterns
710
+ filters:
711
+ ...
712
+ - type: rename
713
+ rules:
714
+ - rule: regex_replace
715
+ match: "([0-9]+)_dollars"
716
+ replace: "USD$1"
717
+
718
+
719
+ Renaming rule: truncate
720
+ ^^^^^^^^^^^^^^^^^^^^^^^^
721
+
722
+ The rule ``truncate`` truncates column names.
723
+
724
+ +------------+-----------------------------------------------------+--------------------+
725
+ | option | description | required? |
726
+ +============+=====================================================+====================+
727
+ | max_length | The length to which the column names are truncated. | ``128`` by default |
728
+ +------------+-----------------------------------------------------+--------------------+
729
+
730
+ Example
731
+ """"""""
732
+
733
+ .. code-block:: yaml
734
+
735
+ # This configuration drops all characters after the 20th character.
736
+ filters:
737
+ ...
738
+ - type: rename
739
+ rules:
740
+ - rule: truncate
741
+ max_length: 20
742
+
743
+ Renaming rule: upper\_to\_lower
744
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
745
+
746
+ The rule ``upper_to_lower`` converts upper-case alphabets to lower-case.
747
+
748
+ Example
749
+ """"""""
750
+
751
+ .. code-block:: yaml
752
+
753
+ # This configuration converts all upper-case alphabets to lower-case.
754
+ filters:
755
+ ...
756
+ - type: rename
757
+ rules:
758
+ - rule: upper_to_lower
759
+
760
+ Renaming rule: unique\_number\_suffix
761
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
762
+
763
+ The rule ``unique_number_suffix`` makes column names unique in the schema by suffixing numbers.
764
+
765
+ +------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
766
+ | option | description | required? |
767
+ +============+=============================================================================================================================+====================+
768
+ | delimiter | A delimiter character inserted before a suffix number. It must be just 1 non-digit character. | ``"_"`` by default |
769
+ +------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
770
+ | digits | An integer that specifies the number of zero-filled digits of a suffix number. The suffix number zero-filled to the digits. | optional |
771
+ +------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
772
+ | max_length | The length to which the column names are truncated. The column name is truncated before the suffix number. | optional |
773
+ +------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
774
+ | offset | An integer where the suffix number starts. The first duplicative column name is suffixed by (``offset`` + 1). | ``1`` by default |
775
+ +------------+-----------------------------------------------------------------------------------------------------------------------------+--------------------+
776
+
777
+ .. hint::
778
+ The procedure to make column names unique is not very trivial. There are many feasible ways. This renaming rule works as follows:
779
+
780
+ Basic policies:
781
+
782
+ * Suffix numbers are counted per original column name.
783
+ * Column names are fixed from the first column to the last column.
784
+
785
+ Actual procedure applied from the first (leftmost) column to the last (rightmost) column:
786
+
787
+ 1. Fix the column name as-is with truncating if the truncated name is not duplicated with left columns.
788
+ 2. Suffix the column name otherwise.
789
+
790
+ a. Try to append the suffix number for the original column name with truncating.
791
+ b. Fix it if the suffixed name is not duplicated with left columns nor original columns.
792
+ c. Retry (a) with the suffix number increased otherwise.
588
793
 
589
794
  Example
590
- ~~~~~~~~~~~~~~~~~~
795
+ """"""""
796
+
797
+ .. code-block:: yaml
798
+
799
+ # This configuration suffixes numbers to duplicative column names. (Ex. ["column", "column", "column"] goes to ["column", "column_2", "column_3"].)
800
+ filters:
801
+ ...
802
+ - type: rename
803
+ rules:
804
+ - rule: unique_number_suffix
805
+
806
+ Example of renaming rules
807
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~
808
+
809
+ .. code-block:: yaml
810
+
811
+ filters:
812
+ ...
813
+ - type: rename
814
+ rules:
815
+ - rule: upper_to_lower # All upper-case are converted to lower-case.
816
+ - rule: character_types # Only lower-case, digits and "_" are allowed. (No upper-case by the rule above.)
817
+ pass_types: [ "a-z", "0-9" ]
818
+ pass_characters: "_"
819
+ - rule: unique_number_suffix # Ensure all column names are unique.
820
+
821
+ Columns: not recommended
822
+ ~~~~~~~~~~~~~~~~~~~~~~~~~
823
+
824
+ ``columns`` is not recommended to use anymore. Consider using ``rules`` instead.
591
825
 
592
826
  .. code-block:: yaml
593
827
 
@@ -598,13 +832,16 @@ Example
598
832
  my_existing_column1: new_column1
599
833
  my_existing_column2: new_column2
600
834
 
835
+ .. hint::
836
+ ``columns`` are applied before ``rules`` if ``columns`` and ``rules`` are specified together. (It is discouraged to specify them together, though.)
837
+
601
838
  Local executor plugin
602
- ------------------
839
+ ----------------------
603
840
 
604
841
  The ``local`` executor plugin runs tasks using local threads. This is the only built-in executor plugin.
605
842
 
606
843
  Options
607
- ~~~~~~~~~~~~~~~~~~
844
+ ~~~~~~~~
608
845
 
609
846
  +------------------+----------+----------------------------------------------------------------------+--------------------------------------+
610
847
  | name | type | description | required? |
@@ -620,7 +857,7 @@ The ``max_threads`` option controls maximum concurrency. Setting smaller number
620
857
  The ``min_output_tasks`` option enables "page scattering". The feature is enabled if number of input tasks is less than ``min_output_tasks``. It uses multiple filter & output threads for each input task so that one input task can use multiple threads. Setting larger number here is useful if embulk doesn't use multi-threading with enough concurrency due to too few number of input tasks. Setting 1 here disables page scattering completely.
621
858
 
622
859
  Example
623
- ~~~~~~~~~~~~~~~~~~
860
+ ~~~~~~~~
624
861
 
625
862
  .. code-block:: yaml
626
863
 
@@ -635,12 +872,12 @@ Example
635
872
  ...
636
873
 
637
874
  Guess executor
638
- ------------------
875
+ ---------------
639
876
 
640
877
  The guess executor is called by ``guess`` command. It executes default guess plugins in a sequential order and suggests Embulk config by appropriate guess plugin. The default guess plugins and the order are ``gzip``, ``bzip2``, ``json`` and ``csv``.
641
878
 
642
879
  Options
643
- ~~~~~~~~~~~~~~~~~~
880
+ ~~~~~~~~
644
881
 
645
882
  +-----------------------+----------+----------------------------------------------------------------------+--------------------------------------+
646
883
  | name | type | description | required? |
@@ -657,7 +894,7 @@ The ``exclude_guess_plugins`` option exclude specified guess plugins from the li
657
894
  This example shows how to use ``csv_all_strings`` guess plugin, which suggests column types within CSV files as string types. It needs to be explicitly specified by users when it's used instead of ``csv`` guess plugin because the plugin is not included in default guess plugins. We also can exclude default ``csv`` guess plugin.
658
895
 
659
896
  Example
660
- ~~~~~~~~~~~~~~~~~~
897
+ ~~~~~~~~
661
898
 
662
899
  .. code-block:: yaml
663
900