datacraft 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +118 -1
- data/lib/datacraft.rb +2 -0
- data/lib/datacraft/cli.rb +4 -3
- data/lib/datacraft/instruction.rb +0 -8
- data/lib/datacraft/parser.rb +14 -0
- data/lib/datacraft/runner.rb +2 -0
- data/lib/datacraft/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bc6e2cf4c695b0b7f6ff9544415a3bcf929ac3f6
|
4
|
+
data.tar.gz: 1c1ab63f47629926fd21a2c4a919defc6d030519
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 814dfe30c18efa74e89ef332e14159ef5077e344baee32bc870c18eb6b045456f35c51c7c3cbdcf1e1c18e1bd85c5f2babf96ac47192a70c04bd28ca02e1df7c
|
7
|
+
data.tar.gz: 12ac00e4e1ba23433cf4b78d570cf15921c3dc459fba9628b2913401020f58d5cf040230207551db61e82513304bbda0908a67ef3a58181930206af1cac0be39
|
data/README.md
CHANGED
@@ -7,6 +7,21 @@ Play with data like a Pro, and have fun like Minecraft.
|
|
7
7
|
[](https://gemnasium.com/xiaoxinghu/datacraft)
|
8
8
|
[](https://codeclimate.com/github/xiaoxinghu/datacraft)
|
9
9
|
|
10
|
+
## what is it
|
11
|
+
|
12
|
+
`Datacraft` is an ETL tool highly inspired by the open source project [kiba](https://github.com/thbar/kiba). But we need certain advanced features which the current kiba does not provide, so we decided to roll out our own version. Here are the major ones already implemented:
|
13
|
+
|
14
|
+
- build action after loading data (explained later)
|
15
|
+
- parallel processing
|
16
|
+
- lazy initialization
|
17
|
+
|
18
|
+
And we need more. Here are some ideas bouncing around in my head that might become new features.
|
19
|
+
|
20
|
+
- data flow, pipelining
|
21
|
+
- dry run with analytic results
|
22
|
+
- snapshot the data on change points and generate report for intuitive debugging.
|
23
|
+
|
24
|
+
|
10
25
|
## Installation
|
11
26
|
|
12
27
|
Add this line to your application's Gemfile:
|
@@ -25,7 +40,109 @@ Or install it yourself as:
|
|
25
40
|
|
26
41
|
## Usage
|
27
42
|
|
28
|
-
|
43
|
+
Here is an example of an instruction script.
|
44
|
+
|
45
|
+
```ruby
|
46
|
+
|
47
|
+
# data source class
|
48
|
+
class CsvFile
|
49
|
+
def initialize(csv_file)
|
50
|
+
@csv = CSV.open(csv_file, headers: true, header_converters: :symbol)
|
51
|
+
end
|
52
|
+
|
53
|
+
# mandatory method for data source class.
|
54
|
+
# Should always yield data rows.
|
55
|
+
def each
|
56
|
+
@csv.each do |row|
|
57
|
+
yield(row.to_hash)
|
58
|
+
end
|
59
|
+
@csv.close
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
# Data consumer class. Here we just output statistic
|
64
|
+
# information of the data set.
|
65
|
+
class ReportBuilder
|
66
|
+
def initialize
|
67
|
+
@total_employee = 0
|
68
|
+
@total_age = 0
|
69
|
+
end
|
70
|
+
|
71
|
+
# mandatory method for consuming data
|
72
|
+
def <<(row)
|
73
|
+
@total_employee += 1
|
74
|
+
@total_age += row[:age].to_i
|
75
|
+
end
|
76
|
+
|
77
|
+
# optional method for building the final product
|
78
|
+
def build
|
79
|
+
File.open('report.txt', 'w') do |f|
|
80
|
+
f.puts "Total Employee: #{@total_employee}"
|
81
|
+
f.puts "Total Age: #{@total_age}"
|
82
|
+
f.puts "Average Age: #{@total_age / @total_employee}" unless @total_employee == 0
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
retirement_age = 60
|
88
|
+
# define data source
|
89
|
+
from CsvFile, 'employees.csv'
|
90
|
+
|
91
|
+
total_rows = 0
|
92
|
+
# tweak each row
|
93
|
+
tweak do |row|
|
94
|
+
# eliminate the retired ones, return nil means to
|
95
|
+
# discard the data
|
96
|
+
total_rows += 1
|
97
|
+
row[:age].to_i < retirement_age ? row : nil
|
98
|
+
end
|
99
|
+
|
100
|
+
# define data consumer
|
101
|
+
to ReportBuilder
|
102
|
+
```
|
103
|
+
|
104
|
+
Run the script:
|
105
|
+
|
106
|
+
```bash
|
107
|
+
$ dcraft build inst.rb
|
108
|
+
```
|
109
|
+
|
110
|
+
### parallel processing
|
111
|
+
|
112
|
+
To improve the performance of the script, you can enable multithreading by setting the option.
|
113
|
+
|
114
|
+
```ruby
|
115
|
+
set :parallel, true # default is false
|
116
|
+
set :n_threads, 4 # default 8
|
117
|
+
```
|
118
|
+
|
119
|
+
Please note that, due to the [GIL](https://en.wikipedia.org/wiki/Global_Interpreter_Lock), we are not able to take advantage of multicore systems with parallel processing. But when the threads are blocked by heavy I/O, which is very common in this circumstance, it can give a considerable performance boost.
|
120
|
+
|
121
|
+
### benchmark
|
122
|
+
|
123
|
+
Sometimes you just want to know: **why is my script so slow?** In order to efficiently spot the bottleneck of your script, you can enable `benchmark mode`.
|
124
|
+
|
125
|
+
```ruby
|
126
|
+
set :benchmark, true
|
127
|
+
```
|
128
|
+
|
129
|
+
Then run your script, and you will see the detailed benchmark information.
|
130
|
+
|
131
|
+
### hooks
|
132
|
+
|
133
|
+
To do something before and/or after the process, define hook blocks.
|
134
|
+
|
135
|
+
```ruby
|
136
|
+
pre_build do
|
137
|
+
# do something
|
138
|
+
end
|
139
|
+
|
140
|
+
post_build do
|
141
|
+
# do something
|
142
|
+
end
|
143
|
+
```
|
144
|
+
|
145
|
+
It is pretty self-explanatory.
|
29
146
|
|
30
147
|
## Development
|
31
148
|
|
data/lib/datacraft.rb
CHANGED
@@ -3,8 +3,10 @@ require 'datacraft/exceptions'
|
|
3
3
|
require 'datacraft/definition'
|
4
4
|
require 'datacraft/context'
|
5
5
|
require 'datacraft/instruction'
|
6
|
+
require 'datacraft/parser'
|
6
7
|
require 'datacraft/runner'
|
7
8
|
|
8
9
|
module Datacraft
|
9
10
|
extend Datacraft::Runner
|
11
|
+
extend Datacraft::Parser
|
10
12
|
end
|
data/lib/datacraft/cli.rb
CHANGED
@@ -5,16 +5,17 @@ module Datacraft
|
|
5
5
|
class Cli < Thor
|
6
6
|
desc 'build [INSTRUCTION_FILE]', 'build the data by instruction'
|
7
7
|
def build(filename)
|
8
|
-
|
9
|
-
Datacraft.run instruction
|
8
|
+
Datacraft.run check(filename)
|
10
9
|
end
|
11
10
|
|
12
11
|
desc 'check [INSTRUCTION_FILE]',
|
13
12
|
'evaluate the instruction without running it'
|
14
13
|
def check(filename)
|
15
14
|
begin
|
16
|
-
|
15
|
+
script = IO.read(filename)
|
16
|
+
instruction = Datacraft.parse script
|
17
17
|
puts 'You are ready to go.'
|
18
|
+
instruction
|
18
19
|
rescue InvalidInstruction => e
|
19
20
|
puts e
|
20
21
|
end
|
@@ -33,13 +33,5 @@ module Datacraft
|
|
33
33
|
def set(key, value)
|
34
34
|
options[key.to_sym] = value
|
35
35
|
end
|
36
|
-
|
37
|
-
def self.from_file(filename)
|
38
|
-
script_content = IO.read(filename)
|
39
|
-
instruction = Instruction.new
|
40
|
-
instruction.instance_eval(script_content)
|
41
|
-
instruction.validate
|
42
|
-
instruction
|
43
|
-
end
|
44
36
|
end
|
45
37
|
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
module Datacraft
|
2
|
+
module Parser
|
3
|
+
def parse(script = nil, &block)
|
4
|
+
instruction = Instruction.new
|
5
|
+
if script
|
6
|
+
instruction.instance_eval(script)
|
7
|
+
else
|
8
|
+
instruction.instance_eval(&block)
|
9
|
+
end
|
10
|
+
instruction.validate
|
11
|
+
instruction
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
data/lib/datacraft/runner.rb
CHANGED
data/lib/datacraft/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: datacraft
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Xiaoxing Hu
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-09-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: thor
|
@@ -122,6 +122,7 @@ files:
|
|
122
122
|
- lib/datacraft/definition.rb
|
123
123
|
- lib/datacraft/exceptions.rb
|
124
124
|
- lib/datacraft/instruction.rb
|
125
|
+
- lib/datacraft/parser.rb
|
125
126
|
- lib/datacraft/runner.rb
|
126
127
|
- lib/datacraft/version.rb
|
127
128
|
homepage: https://github.com/xiaoxinghu/datacraft
|