hog 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +22 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +173 -0
- data/Rakefile +2 -0
- data/bin/hog +5 -0
- data/example/runner.pig +13 -0
- data/example/tuple_processor.rb +29 -0
- data/example/tuple_processor_udf.rb +22 -0
- data/hog.gemspec +23 -0
- data/lib/hog/field.rb +52 -0
- data/lib/hog/testing/pig_stubs.rb +16 -0
- data/lib/hog/tuple.rb +41 -0
- data/lib/hog/utils.rb +12 -0
- data/lib/hog/version.rb +3 -0
- data/lib/hog.rb +19 -0
- metadata +95 -0
data/.gitignore
ADDED
@@ -0,0 +1,22 @@
+*.gem
+*.rbc
+.bundle
+.config
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+*.bundle
+*.so
+*.o
+*.a
+mkmf.log
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2014 jondot
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,173 @@
+# Hog
+
+Supercharge your Ruby Pig UDFs with Hog.
+
+
+## Why Use Hog?
+
+Use Hog when you want to process (map) tuples with Hadoop, and generally
+believe that:
+
+* It's too much overhead to write custom M/R code
+* You're too lazy to process with Pig data pipes and sprinkled custom UDFs, and they're not powerful
+  enough anyway.
+* Data processing with Hive queries is wrong
+* Cascading is overkill for your case
+* Streaming Hadoop (unix processes) is the last resort
+
+And instead want to develop within your familiar Ruby ecosystem but
+also:
+
+* Control and tweak job parameters via Pig
+* Use Pig's job management
+* Use Pig's I/O (serdes) easily
+
+And you also want to use gems, jars, and everything Ruby (JRuby) has to offer for a really quick turn-around and pleasant development experience.
+
+
+## Quickstart
+
+Let's say you want to perform geolocation, detect a device's form
+factor, and shove bits and fields from the nested structure into a Hive table to be able to query
+it efficiently.
+
+This is your raw event:
+
+```javascript
+{
+  "x-forwarded-for":"8.8.8.8",
+  "user-agent": "123",
+  "pageview": {
+    "userid":"ud-123",
+    "timestamp":123123123,
+    "headers":{
+      "user-agent":"...",
+      "cookies":"..."
+    }
+  }
+}
+```
+
+To do this with Pig alone, you would need:
+
+* A Pig UDF for geolocation
+* A Pig UDF for form-factor detection
+* A way to extract JSON fields from a tuple in Pig
+* A way to combine everything in Pig code
+
+And you'll give up:
+
+* Testability
+* Maintainability
+* Development speed
+* Eventually, sanity
+
+
+#### Using Hog
+
+With Hog, you'll mostly feel you're _describing_ how you want the data to be
+shaped, and it'll do the heavy lifting.
+
+You always have the option to 'drop' to code with the `prepare`
+block.
+
+
+```ruby
+# gem install hog
+require 'hog'
+
+TupleProcessor = Hog.tuple "mytuple" do
+
+  # Your own custom feature extraction. This can be any free-style
+  # Ruby or Java code
+  #
+  prepare do |hsh|
+    loc = ip2location(hsh["x-forwarded-for"])
+    hsh.merge!(loc)
+
+    form_factor = formfactor(hsh["user-agent"])
+    hsh.merge!(form_factor)
+  end
+
+
+  # Describe your data columns (i.e. for Hive), and how you'd like
+  # to pull them out of your nested data.
+  #
+  # You can also put a fixed value, or serialize a sub-nested
+  # structure as-is for later use with Hive's json_object
+  #
+  chararray :id, :path => 'pageview.userid'
+  chararray :created_at, :path => 'pageview.timestamp', :with_nil => ''
+  float :sampling, :value => 1.0
+  chararray :ev_type, :value => 'pageview'
+  chararray :ev_info, :json => 'pageview.headers', :default => {}
+end
+```
+
+As you'll notice, there's very little code, and you specify only the bare
+essentials -- types and fields -- which you would have had to specify with Pig in any way you'd try to use it.
+
+
+Hog will generate a `TupleProcessor` that conforms to your description and logic.
+
+
+#### Testing
+
+To test your mapper, just use `TupleProcessor` within your specs and
+give it a raw event. It's plain Ruby.
+
+
+#### Hadoop
+
+To hook up with Hadoop, rig `TupleProcessor` within a Pig JRuby UDF shell like so:
+
+```ruby
+require 'tuple_processor'
+require 'pigudf'
+
+class TupleProcessorUdf < PigUdf
+  # TupleProcessor will automatically generate your UDF schema, no matter how complex or
+  # convoluted.
+  outputSchema TupleProcessor.schema
+
+  # Use TupleProcessor as the processing logic.
+  def process(line)
+    # Since one raw JSON event can produce several rows, 'process' returns
+    # an array. We pass that to Pig as Pig's own 'DataBag'.
+    # You can also do without and just return whatever TupleProcessor#process returns.
+    databag = DataBag.new
+    TupleProcessor.process(line).each do |res|
+      databag.add(res)
+    end
+
+    databag
+  end
+end
+```
+
+
+## Gems and Jars
+
+You can't really use gems or jars with Pig Ruby UDFs unless all machines have
+them, and even then it becomes ugly to manage.
+
+The solution is neat: pack everything as you would with a JRuby project,
+for example with `warbler`, and your job code becomes a standalone jar
+which you deploy, version, and possibly generate via CI.
+
+
+## Related Projects
+
+* PigPen - https://github.com/Netflix/PigPen
+* datafu - https://github.com/linkedin/datafu
+
+
+# Contributing
+
+Fork, implement, add tests, pull request, get my everlasting thanks and a respectable place here :).
+
+# Copyright
+
+Copyright (c) 2014 [Dotan Nahum](http://gplus.to/dotan) [@jondot](http://twitter.com/jondot). See MIT-LICENSE for further details.
+
+
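Note on the Testing section above: since `TupleProcessor` is plain Ruby, a spec only needs to feed it an event hash and check the resulting row. Below is a minimal sketch, assuming RSpec and a simplified tuple (no geolocation or form-factor helpers); the file name and fields are illustrative, not part of the package.

```ruby
# spec/tuple_processor_spec.rb -- illustrative only, assuming RSpec is available.
require 'hog'

describe "a Hog tuple processor" do
  # A simplified, self-contained tuple; real mappers would plug in their own prepare logic.
  processor = Hog.tuple "mytuple" do
    prepare { |hsh| hsh["seen"] = true }      # free-form Ruby runs before field extraction
    chararray :id,      :path  => 'pageview.userid'
    chararray :ev_type, :value => 'pageview'
    int       :seen                           # :path defaults to the field name
  end

  it "maps a raw event hash to an ordered row" do
    event = { "pageview" => { "userid" => "ud-123" } }
    # Tuple#process returns the field values in declaration order;
    # the int field coerces the boolean set in prepare to 1.
    expect(processor.process(event)).to eq(["ud-123", "pageview", 1])
  end
end
```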
data/Rakefile
ADDED
data/bin/hog
ADDED
data/example/runner.pig
ADDED
@@ -0,0 +1,13 @@
+-- Since most work is in your TupleProcessor, this file serves as a simple shim
+-- that loads data into raw lines, and sends them off to your UDF.
+-- As a result, it will mostly stay the same.
+-- You can tweak any M/R or Pig params here as well.
+
+REGISTER 'your-warbled-dependencies-jar.jar'
+REGISTER 'tuple_processor_udf.rb' USING jruby AS processor;
+
+data = FOREACH ( LOAD '/raw-data' AS (line:CHARARRAY) )
+       GENERATE flatten(processor.process(line));
+
+store data into '/out-data' USING PARQUETFILE
+
data/example/tuple_processor.rb
ADDED
@@ -0,0 +1,29 @@
+require 'hog'
+
+TupleProcessor = Hog.tuple "mytuple" do
+
+  # Your own custom feature extraction. This can be any free-style
+  # Ruby or Java code
+  #
+  prepare do |hsh|
+    loc = ip2location(hsh["x-forwarded-for"])
+    hsh.merge!(loc)
+
+    form_factor = formfactor(hsh["user-agent"])
+    hsh.merge!(form_factor)
+  end
+
+
+  # Describe your data columns (i.e. for Hive), and how you'd like
+  # to pull them out of your nested data.
+  #
+  # You can also put a fixed value, or serialize a sub-nested
+  # structure as-is for later use with Hive's json_object
+  #
+  chararray :id, :path => 'pageview.userid'
+  chararray :created_at, :path => 'pageview.timestamp', :with_nil => ''
+  float :sampling, :value => 1.0
+  chararray :ev_type, :value => 'pageview'
+  chararray :ev_info, :json => 'pageview.headers', :default => {}
+end
+
data/example/tuple_processor_udf.rb
ADDED
@@ -0,0 +1,22 @@
+require 'pigudf'
+require 'tuple_processor'
+
+class TupleProcessorUdf < PigUdf
+  # TupleProcessor will automatically generate your UDF schema, no matter how complex or
+  # convoluted.
+  outputSchema TupleProcessor.schema
+
+  # Use TupleProcessor as the processing logic.
+  def process(line)
+    # Since one raw JSON event can produce several rows, 'process' returns
+    # an array. We pass that to Pig as Pig's own 'DataBag'.
+    # You can also do without and just return whatever TupleProcessor#process returns.
+    databag = DataBag.new
+    TupleProcessor.process(line).each do |res|
+      databag.add(res)
+    end
+
+    databag
+  end
+end
+
data/hog.gemspec
ADDED
@@ -0,0 +1,23 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'hog/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "hog"
+  spec.version       = Hog::VERSION
+  spec.authors       = ["jondot"]
+  spec.email         = ["jondotan@gmail.com"]
+  spec.summary       = %q{}
+  spec.description   = %q{}
+  spec.homepage      = ""
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files -z`.split("\x0")
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_development_dependency "bundler", "~> 1.6"
+  spec.add_development_dependency "rake"
+end
data/lib/hog/field.rb
ADDED
@@ -0,0 +1,52 @@
+module Hog
+  class Field
+    attr_reader :opts
+
+    def initialize(type, name, opts={})
+      @name = name
+      @type = type
+      @opts = opts
+      opts[:path] ||= name.to_s
+    end
+
+    def get(hash)
+      val = nil
+      if opts[:value]
+        val = opts[:value]
+      elsif opts[:json]
+        lkval = lookup(opts[:json],hash) || opts[:default]
+        val = JSON.dump(lkval)
+      else
+        val = lookup(opts[:path], hash)
+      end
+      get_value(val)
+    end
+
+    def to_s
+      "#{@name}:#{@type}"
+    end
+
+    private
+
+    def get_value(val)
+      # xxx optimize?
+      if @type == "int" && [TrueClass, FalseClass].include?(val.class)
+        val ? 1 : 0
+      elsif @type == "chararray"
+        val.to_s
+      else
+        val
+      end
+    end
+
+    def lookup(path, hash)
+      begin
+        path.split('.').inject(hash) {|acc, value| acc[value]}
+      rescue
+        return opts[:with_nil] if opts[:with_nil]
+        raise "No path: #{path} in hash: #{hash}"
+      end
+    end
+  end
+end
+
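For reference, a short sketch of how the `:path`, `:with_nil`, and `:json` options above behave when `Field#get` runs against an event hash. The event and field names are made up for illustration; in normal use fields are declared through the `Hog.tuple` DSL rather than constructed directly.

```ruby
require 'hog'

event = { "pageview" => { "userid" => "ud-123", "headers" => { "cookies" => "..." } } }

# :path walks the nested hash, dot-separated.
Hog::Field.new("chararray", :id, :path => 'pageview.userid').get(event)
# => "ud-123"

# :with_nil is returned when the lookup blows up on a missing branch
# (here "session" is absent), instead of raising.
Hog::Field.new("chararray", :created_at,
               :path => 'session.timestamp', :with_nil => '').get(event)
# => ""

# :json serializes a nested structure as-is, e.g. for Hive's json_object.
Hog::Field.new("chararray", :ev_info, :json => 'pageview.headers').get(event)
# => "{\"cookies\":\"...\"}"
```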
data/lib/hog/tuple.rb
ADDED
@@ -0,0 +1,41 @@
+# todos
+# - safe hash path lookup + provide default value?
+
+
+module Hog
+  class Tuple
+    def initialize(name, block)
+      @name = name
+      @fields = []
+      self.instance_eval &block
+    end
+
+    def process(hsh)
+      @prepare.call(hsh)
+      res = []
+      @fields.each do |f|
+        res << f.get(hsh)
+      end
+      res
+    end
+
+    def prepare(&block)
+      @prepare = block
+    end
+
+    def to_s
+      "(#{@fields.map{|f| f.to_s }.join(',')})"
+    end
+
+    def method_missing(meth, *args, &block)
+      if [:chararray, :float, :double, :int, :long, :map].include?(meth)
+        f = Field.new(meth.to_s, *args)
+        @fields << f
+        return f
+      else
+        super
+      end
+    end
+  end
+end
+
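A quick sketch of the DSL mechanics above: the type methods (`chararray`, `int`, `float`, ...) are intercepted by `method_missing` and collected as `Field`s, `to_s` yields the Pig-style schema string, and `process` runs `prepare` and then returns field values in declaration order. The tuple and event below are illustrative only.

```ruby
require 'hog'

t = Hog.tuple "mytuple" do
  prepare { |hsh| }                  # no-op; Tuple#process calls @prepare unconditionally
  chararray :id                      # each type method becomes a Field via method_missing
  int       :clicks
  float     :sampling, :value => 1.0
end

t.to_s
# => "(id:chararray,clicks:int,sampling:float)"

t.process({ "id" => "ud-123", "clicks" => 2 })
# => ["ud-123", 2, 1.0]
```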
data/lib/hog/utils.rb
ADDED
data/lib/hog/version.rb
ADDED
data/lib/hog.rb
ADDED
@@ -0,0 +1,19 @@
+require "hog/version"
+require 'json'
+require 'hog/tuple'
+require 'hog/field'
+require 'hog/utils'
+
+module Hog
+  class << self
+    include Hog::Utils
+  end
+
+  def self.tuple(name, &block)
+    Tuple.new(name, block)
+  end
+end
+
+
+
+
metadata
ADDED
@@ -0,0 +1,95 @@
+--- !ruby/object:Gem::Specification
+name: hog
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+  prerelease:
+platform: ruby
+authors:
+- jondot
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-06-18 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: ''
+email:
+- jondotan@gmail.com
+executables:
+- hog
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/hog
+- example/runner.pig
+- example/tuple_processor.rb
+- example/tuple_processor_udf.rb
+- hog.gemspec
+- lib/hog.rb
+- lib/hog/field.rb
+- lib/hog/testing/pig_stubs.rb
+- lib/hog/tuple.rb
+- lib/hog/utils.rb
+- lib/hog/version.rb
+homepage: ''
+licenses:
+- MIT
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.23
+signing_key:
+specification_version: 3
+summary: ''
+test_files: []