metacrunch-file 1.1.0 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.circleci/config.yml +5 -6
- data/Gemfile +1 -1
- data/Readme.md +42 -12
- data/lib/metacrunch/file.rb +4 -0
- data/lib/metacrunch/file/csv_source.rb +35 -0
- data/lib/metacrunch/file/destination.rb +12 -0
- data/lib/metacrunch/file/file_destination.rb +36 -0
- data/lib/metacrunch/file/file_source.rb +56 -0
- data/lib/metacrunch/file/source.rb +3 -47
- data/lib/metacrunch/file/version.rb +1 -2
- data/lib/metacrunch/file/xlsx_destination.rb +11 -3
- data/metacrunch-file.gemspec +3 -2
- metadata +31 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 1a64cc33b8badb7d4e6b7c29267501db505edbb081ed55be3c4bdf269109eac8
|
4
|
+
data.tar.gz: 0bb18d835fb8e812ff04aded3b8bfc3d90f8daae606704c49d2dbfe502cdaafd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a28ff6fcb40f286673011ee27de86a15d579bdc213b379eb77781eddbd6efa5613e3d68bd16e1d016d61f9d786b143c35e2b276961affbad21d71707060c65a4
|
7
|
+
data.tar.gz: e58ec6254b11f956c4a45a0ef68477cee7b9bb2e79ce5fd283f5a3e0450e0e4b49c5c53796f7e6234ad9742d46fd42e19fed6bd2be231847b8de6a82cb188d57
|
data/.circleci/config.yml
CHANGED
@@ -1,12 +1,11 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
version: 2
|
1
|
+
version: 2.1
|
2
|
+
orbs:
|
3
|
+
ruby: circleci/ruby@1.1.1
|
4
|
+
|
6
5
|
jobs:
|
7
6
|
build:
|
8
7
|
docker:
|
9
|
-
- image: circleci/ruby:2.
|
8
|
+
- image: circleci/ruby:2.6-node-browsers
|
10
9
|
|
11
10
|
working_directory: ~/repo
|
12
11
|
|
data/Gemfile
CHANGED
data/Readme.md
CHANGED
@@ -6,7 +6,10 @@ metacrunch-file
|
|
6
6
|
[![Test Coverage](https://codeclimate.com/github/ubpb/metacrunch-file/badges/coverage.svg)](https://codeclimate.com/github/ubpb/metacrunch-file/coverage)
|
7
7
|
[![CircleCI](https://circleci.com/gh/ubpb/metacrunch-file.svg?style=svg)](https://circleci.com/gh/ubpb/metacrunch-file)
|
8
8
|
|
9
|
-
This is the official file package for the [metacrunch ETL toolkit](https://github.com/ubpb/metacrunch).
|
9
|
+
This is the official file package for the [metacrunch ETL toolkit](https://github.com/ubpb/metacrunch).
|
10
|
+
|
11
|
+
*Note: For working examples on how to use this package check out our [demo repository](https://github.com/ubpb/metacrunch-demo).*
|
12
|
+
|
10
13
|
|
11
14
|
Installation
|
12
15
|
------------
|
@@ -14,7 +17,7 @@ Installation
|
|
14
17
|
Include the gem in your `Gemfile`
|
15
18
|
|
16
19
|
```ruby
|
17
|
-
gem "metacrunch-file", "~> 1.
|
20
|
+
gem "metacrunch-file", "~> 1.4.0"
|
18
21
|
```
|
19
22
|
|
20
23
|
and run `$ bundle install` to install it.
|
@@ -29,24 +32,20 @@ $ gem install metacrunch-file
|
|
29
32
|
Usage
|
30
33
|
-----
|
31
34
|
|
32
|
-
|
33
|
-
|
34
|
-
### `Metacrunch::File::Source`
|
35
|
+
## `Metacrunch::File::FileSource`
|
35
36
|
|
36
37
|
This class provides a metacrunch `source` implementation that can be used to read data from files in the file system into a metacrunch job. The class can be used to read regular files, compressed files (gzip), tar archives and compressed tar archives (gzip).
|
37
38
|
|
38
|
-
You can access non-option arguments from the command line using the `ARGV` constant.
|
39
|
-
|
40
39
|
```ruby
|
41
40
|
# my_job.metacrunch
|
42
41
|
|
43
42
|
# If you call this example like so
|
44
43
|
# $ metacrunch my_job.metacrunch *.xml
|
45
44
|
# ARGV will contain all the XML files in the current directory.
|
46
|
-
source Metacrunch::File::
|
45
|
+
source Metacrunch::File::FileSource.new(ARGV)
|
47
46
|
|
48
47
|
# ... or you can set the filenames directly
|
49
|
-
source Metacrunch::File::
|
48
|
+
source Metacrunch::File::FileSource.new(["my-data.xml", "my-other-data.xml", "..."])
|
50
49
|
```
|
51
50
|
|
52
51
|
**Options**
|
@@ -67,12 +66,43 @@ transformation ->(file_entry) do
|
|
67
66
|
end
|
68
67
|
```
|
69
68
|
|
69
|
+
## `Metacrunch::File::FileDestination`
|
70
|
+
|
71
|
+
This class provides a metacrunch `destination` to write data to a file. All data passed to the destination is appended to the given file. If the data is an `Array`, every element of that array is appended to the file. Files that do not exist yet are created automatically.
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
# my_job.metacrunch
|
75
|
+
|
76
|
+
destination Metacrunch::File::FileDestination.new("/tmp/my-data.txt" [, OPTIONS])
|
77
|
+
```
|
78
|
+
|
79
|
+
**Options**
|
80
|
+
|
81
|
+
* `override_existing_file`: Overrides an existing file if set to `true`. If set to `false` an error is raised if the file already exists. Defaults to `false`.
|
82
|
+
|
83
|
+
## `Metacrunch::File::CSVSource`
|
84
|
+
|
85
|
+
This class provides a metacrunch `source` for reading CSV files. It is a simple wrapper around the [smarter_csv](https://github.com/tilo/smarter_csv) gem.
|
86
|
+
|
87
|
+
```ruby
|
88
|
+
# my_job.metacrunch
|
89
|
+
|
90
|
+
source Metacrunch::File::CSVSource.new("my.csv" [, OPTIONS])
|
91
|
+
```
|
92
|
+
|
93
|
+
**Options**
|
94
|
+
|
95
|
+
* `headers`: Whether or not the file contains headers as the first line. Important if the file does not contain headers, otherwise you would lose the first line of data. Defaults to `true`.
|
96
|
+
* `col_sep`: Column separator. Defaults to `,`.
|
97
|
+
* `row_sep`: Row separator or record separator. Defaults to `\n`.
|
98
|
+
* `quote_char`: Quotation character. Defaults to `"`.
|
99
|
+
* `file_encoding`: Set the file encoding. Defaults to `utf-8`.
|
70
100
|
|
71
|
-
|
101
|
+
## `Metacrunch::File::XLSXDestination`
|
72
102
|
|
73
103
|
This class provides a metacrunch `destination` implementation to create simple Excel (xlsx) files.
|
74
104
|
|
75
|
-
To use this destination a transformation is required to format the data in a proper array that can be
|
105
|
+
To use this destination a transformation is required to format the data in a proper array that can be passed to the destination. When defining the destination you must provide an array of column names. Each data row passed to the destination must be an array of the same size as the column array.
|
76
106
|
|
77
107
|
```ruby
|
78
108
|
# my_job.metacrunch
|
@@ -81,7 +111,7 @@ transformation ->(data) do
|
|
81
111
|
[data["foo"], data["bar"], ...]
|
82
112
|
end
|
83
113
|
|
84
|
-
|
114
|
+
destination Metacrunch::File::XLSXDestination.new(
|
85
115
|
"/tmp/my-data.xlsx", # filename
|
86
116
|
["Column 1", "Column 2", ...], # header columns
|
87
117
|
OPTIONS
|
data/lib/metacrunch/file.rb
CHANGED
@@ -4,7 +4,11 @@ require "active_support/core_ext"
|
|
4
4
|
module Metacrunch
|
5
5
|
module File
|
6
6
|
require_relative "file/entry"
|
7
|
+
require_relative "file/csv_source"
|
8
|
+
require_relative "file/file_source"
|
7
9
|
require_relative "file/source"
|
10
|
+
require_relative "file/file_destination"
|
11
|
+
require_relative "file/destination"
|
8
12
|
require_relative "file/xlsx_destination"
|
9
13
|
end
|
10
14
|
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require "metacrunch/file"
|
2
|
+
require "smarter_csv"
|
3
|
+
|
4
|
+
module Metacrunch
  # Source that reads a CSV file by delegating to SmarterCSV.process.
  #
  # The public option names (`headers`, `col_sep`, ...) are translated to the
  # option names SmarterCSV expects when the file is processed.
  class File::CSVSource

    # Defaults applied to every option the caller does not set.
    # Frozen so the shared constant cannot be modified by accident.
    DEFAULT_OPTIONS = {
      headers: true,
      col_sep: ",",
      row_sep: "\n",
      quote_char: '"',
      file_encoding: "utf-8"
    }.freeze

    # @param csv_filename [String] the CSV file handed to SmarterCSV
    # @param options [Hash] overrides for DEFAULT_OPTIONS
    def initialize(csv_filename, options = {})
      @filename = csv_filename
      @options  = DEFAULT_OPTIONS.merge(options)
    end

    # Yields whatever SmarterCSV.process yields for the configured file.
    # Returns an Enumerator when called without a block.
    def each
      return enum_for(__method__) unless block_given?

      smarter_csv_options = {
        headers_in_file: @options[:headers],
        col_sep: @options[:col_sep],
        row_sep: @options[:row_sep],
        quote_char: @options[:quote_char],
        file_encoding: @options[:file_encoding]
      }

      # Exactly one value is forwarded per callback, as before.
      SmarterCSV.process(@filename, smarter_csv_options) do |line|
        yield line
      end
    end

  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require "metacrunch/file"
|
2
|
+
|
3
|
+
module Metacrunch
  # Deprecated name for File::FileDestination, kept so existing jobs keep
  # working. It adds nothing except a deprecation notice on construction.
  class File::Destination < File::FileDestination

    # Prints the deprecation notice, then passes both arguments unchanged to
    # File::FileDestination#initialize.
    def initialize(filename, options = {})
      warn("[DEPRECATION] `Metacrunch::File::Destination` is deprecated. Please use `Metacrunch::File::FileDestination` instead.")
      super(filename, options)
    end

  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require "metacrunch/file"
|
2
|
+
|
3
|
+
module Metacrunch
  # Destination that writes every record it receives to one file.
  #
  # The file is opened in mode 'wb+' when the destination is created, so it is
  # created if missing and truncated if present. Each #write call then adds to
  # the end of the open handle until #close is called.
  class File::FileDestination

    # Defaults applied to every option the caller does not set.
    DEFAULT_OPTIONS = {
      override_existing_file: false
    }

    # @param filename [String] target path, expanded with ::File.expand_path
    # @param options [Hash] overrides for DEFAULT_OPTIONS
    # @raise [RuntimeError] if the file already exists and
    #   `override_existing_file` is `false`
    def initialize(filename, options = {})
      @filename = ::File.expand_path(filename)
      # No default value is a Hash, so a plain merge is equivalent to a deep merge here.
      @options  = DEFAULT_OPTIONS.merge(options)

      # `File.exists?` was removed in Ruby 3.2; `File.exist?` works on every version.
      if ::File.exist?(@filename) && @options[:override_existing_file] == false
        raise "File `#{@filename}` exists but `override_existing_file` option was set to `false`"
      end

      @file = ::File.open(@filename, 'wb+')
    end

    # Writes `data` to the file. An Array is written element by element;
    # anything else is written as a single value. Blank data is ignored.
    def write(data)
      return if data.blank?

      if data.is_a?(Array)
        data.each { |row| @file.write(row) }
      else
        @file.write(data)
      end
    end

    # Closes the underlying file handle if one was opened.
    def close
      @file&.close
    end

  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require "metacrunch/file"
|
2
|
+
require "rubygems/package"
|
3
|
+
|
4
|
+
module Metacrunch
  # Source that reads files from the file system and yields one File::Entry
  # per file. Names ending in .tar, .tar.gz or .tgz are read as tar archives
  # and yield one entry per contained file; names ending in .gz or .tgz are
  # decompressed with Zlib::GzipReader.
  class File::FileSource

    # @param filenames [String, Array<String>] one or more file names;
    #   blank values are dropped
    def initialize(filenames)
      @filenames = Array(filenames).map { |f| f.presence }.compact
    end

    # Yields a File::Entry for every regular file and every file inside every
    # tar archive. Returns an Enumerator when called without a block.
    def each(&block)
      return enum_for(__method__) unless block_given?

      @filenames.each do |filename|
        if is_archive?(filename)
          read_archive(filename, &block)
        else
          read_regular_file(filename, &block)
        end
      end
    end

  private

    def is_archive?(filename)
      filename.end_with?(".tar", ".tar.gz", ".tgz")
    end

    def is_gzip_file?(filename)
      filename.end_with?(".gz", ".tgz")
    end

    # Opens `filename` (through gzip when the name says so), passes the IO to
    # the block and closes it afterwards, even if the block raises or the
    # caller stops iterating early.
    def open_input(filename, mode, &block)
      if is_gzip_file?(filename)
        Zlib::GzipReader.open(filename, &block)
      else
        ::File.open(filename, mode, &block)
      end
    end

    # Yields a single entry with the whole file content. Names that are not
    # an existing regular file are skipped silently.
    def read_regular_file(filename, &block)
      return unless ::File.file?(filename)

      # Read everything first so the handle is closed before control is
      # handed to the consumer.
      contents = open_input(filename, "r") { |io| io.read }
      yield File::Entry.new(filename: filename, archive_filename: nil, contents: contents)
    end

    # Yields one entry per regular file stored in the tar archive.
    def read_archive(filename, &block)
      # Tar data is binary, so never open it in text mode.
      open_input(filename, "rb") do |io|
        Gem::Package::TarReader.new(io).each do |tar_entry|
          next unless tar_entry.file?

          yield File::Entry.new(
            filename: filename,
            archive_filename: tar_entry.full_name,
            contents: tar_entry.read
          )
        end
      end
    end

  end
end
|
@@ -1,55 +1,11 @@
|
|
1
1
|
require "metacrunch/file"
|
2
|
-
require "rubygems/package"
|
3
2
|
|
4
3
|
module Metacrunch
|
5
|
-
class File::Source
|
4
|
+
class File::Source < File::FileSource
|
6
5
|
|
7
6
|
def initialize(filenames)
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
def each(&block)
|
12
|
-
return enum_for(__method__) unless block_given?
|
13
|
-
|
14
|
-
@filenames.each do |filename|
|
15
|
-
if is_archive?(filename)
|
16
|
-
read_archive(filename, &block)
|
17
|
-
else
|
18
|
-
read_regular_file(filename, &block)
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
private
|
24
|
-
|
25
|
-
def is_archive?(filename)
|
26
|
-
filename.ends_with?(".tar") || filename.ends_with?(".tar.gz") || filename.ends_with?(".tgz")
|
27
|
-
end
|
28
|
-
|
29
|
-
def is_gzip_file?(filename)
|
30
|
-
filename.ends_with?(".gz") || filename.ends_with?(".tgz")
|
31
|
-
end
|
32
|
-
|
33
|
-
def read_regular_file(filename, &block)
|
34
|
-
if ::File.file?(filename)
|
35
|
-
io = is_gzip_file?(filename) ? Zlib::GzipReader.open(filename) : ::File.open(filename, "r")
|
36
|
-
yield File::Entry.new(filename: filename, archive_filename: nil, contents: io.read)
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
|
-
def read_archive(filename, &block)
|
41
|
-
io = is_gzip_file?(filename) ? Zlib::GzipReader.open(filename) : ::File.open(filename, "r")
|
42
|
-
tarReader = Gem::Package::TarReader.new(io)
|
43
|
-
|
44
|
-
tarReader.each do |_tar_entry|
|
45
|
-
if _tar_entry.file?
|
46
|
-
yield File::Entry.new(
|
47
|
-
filename: filename,
|
48
|
-
archive_filename: _tar_entry.full_name,
|
49
|
-
contents: _tar_entry.read
|
50
|
-
)
|
51
|
-
end
|
52
|
-
end
|
7
|
+
warn "[DEPRECATION] `Metacrunch::File::Source` is deprecated. Please use `Metacrunch::File::FileSource` instead."
|
8
|
+
super
|
53
9
|
end
|
54
10
|
|
55
11
|
end
|
@@ -20,9 +20,17 @@ module Metacrunch
|
|
20
20
|
@sheet.add_row(columns, types: :string)
|
21
21
|
end
|
22
22
|
|
23
|
-
def write(
|
24
|
-
return if
|
25
|
-
|
23
|
+
# Adds `data` to the worksheet as string-typed rows.
#
# When the first element of `data` is itself an Array, `data` is treated as a
# list of rows and each one is added; otherwise `data` is added as one row.
# Blank data is ignored.
#
# @raise [ArgumentError] if non-blank `data` is not an Array
def write(data)
  return if data.blank?
  raise ArgumentError, "Data must be an Array" unless data.is_a?(Array)

  multiple_rows = data.first.is_a?(Array)
  return @sheet.add_row(data, types: :string) unless multiple_rows

  data.each { |row| @sheet.add_row(row, types: :string) }
end
|
27
35
|
|
28
36
|
def close
|
data/metacrunch-file.gemspec
CHANGED
@@ -15,6 +15,7 @@ Gem::Specification.new do |spec|
|
|
15
15
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
16
16
|
spec.require_paths = ["lib"]
|
17
17
|
|
18
|
-
spec.add_dependency "activesupport", ">=
|
19
|
-
spec.add_dependency "
|
18
|
+
spec.add_dependency "activesupport", ">= 0"
|
19
|
+
spec.add_dependency "caxlsx", "~> 3.0"
|
20
|
+
spec.add_dependency "smarter_csv", "~> 1.2"
|
20
21
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metacrunch-file
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- René Sprotte
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-10-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -16,30 +16,44 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: '0'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
26
|
+
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
|
-
name:
|
28
|
+
name: caxlsx
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version:
|
33
|
+
version: '3.0'
|
34
34
|
type: :runtime
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version:
|
41
|
-
|
42
|
-
|
40
|
+
version: '3.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: smarter_csv
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.2'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.2'
|
55
|
+
description:
|
56
|
+
email:
|
43
57
|
executables: []
|
44
58
|
extensions: []
|
45
59
|
extra_rdoc_files: []
|
@@ -53,7 +67,11 @@ files:
|
|
53
67
|
- Readme.md
|
54
68
|
- bin/console
|
55
69
|
- lib/metacrunch/file.rb
|
70
|
+
- lib/metacrunch/file/csv_source.rb
|
71
|
+
- lib/metacrunch/file/destination.rb
|
56
72
|
- lib/metacrunch/file/entry.rb
|
73
|
+
- lib/metacrunch/file/file_destination.rb
|
74
|
+
- lib/metacrunch/file/file_source.rb
|
57
75
|
- lib/metacrunch/file/source.rb
|
58
76
|
- lib/metacrunch/file/version.rb
|
59
77
|
- lib/metacrunch/file/xlsx_destination.rb
|
@@ -62,7 +80,7 @@ homepage: http://github.com/ubpb/metacrunch-file
|
|
62
80
|
licenses:
|
63
81
|
- MIT
|
64
82
|
metadata: {}
|
65
|
-
post_install_message:
|
83
|
+
post_install_message:
|
66
84
|
rdoc_options: []
|
67
85
|
require_paths:
|
68
86
|
- lib
|
@@ -77,9 +95,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
77
95
|
- !ruby/object:Gem::Version
|
78
96
|
version: '0'
|
79
97
|
requirements: []
|
80
|
-
|
81
|
-
|
82
|
-
signing_key:
|
98
|
+
rubygems_version: 3.1.4
|
99
|
+
signing_key:
|
83
100
|
specification_version: 4
|
84
101
|
summary: File package for the metacrunch ETL toolkit.
|
85
102
|
test_files: []
|