cascading.jruby 0.0.9 → 0.0.10
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +15 -0
- data/lib/cascading/assembly.rb +138 -17
- data/lib/cascading/base.rb +0 -4
- data/lib/cascading/cascade.rb +25 -16
- data/lib/cascading/cascading.rb +25 -5
- data/lib/cascading/ext/array.rb +1 -7
- data/lib/cascading/flow.rb +18 -19
- data/lib/cascading/mode.rb +5 -1
- data/lib/cascading/operations.rb +11 -4
- data/lib/cascading/tap.rb +4 -0
- data/lib/cascading.rb +1 -5
- data/test/test_assembly.rb +135 -29
- data/test/test_cascade.rb +80 -0
- data/test/test_flow.rb +20 -0
- data/test/test_operations.rb +3 -2
- metadata +6 -76
- data/.travis.yml +0 -6
- data/Gemfile +0 -6
- data/Gemfile.lock +0 -12
- data/HACKING.md +0 -23
- data/README.md +0 -9
- data/Rakefile +0 -46
- data/TODO +0 -13
- data/bin/make_job +0 -81
- data/ivy.xml +0 -25
- data/ivysettings.xml +0 -7
- data/samples/branch.rb +0 -30
- data/samples/copy.rb +0 -20
- data/samples/data/data2.txt +0 -88799
- data/samples/data/data_group_by.txt +0 -7
- data/samples/data/data_join1.txt +0 -3
- data/samples/data/data_join2.txt +0 -3
- data/samples/data/data_join3.txt +0 -3
- data/samples/data/genealogy/names/dist.all.last +0 -88799
- data/samples/data/gutenberg/the_outline_of_science_vol_1 +0 -12761
- data/samples/group_by.rb +0 -61
- data/samples/join.rb +0 -31
- data/samples/logwordcount.rb +0 -22
- data/samples/project.rb +0 -23
- data/samples/rename.rb +0 -20
- data/samples/scorenames.rb +0 -20
- data/samples/splitter.rb +0 -19
- data/samples/sub_assembly.rb +0 -30
- data/samples/union.rb +0 -36
- data/spec/cascading_spec.rb +0 -105
- data/spec/expr_spec.rb +0 -230
- data/spec/jruby_version_spec.rb +0 -72
- data/spec/resource/join_input.txt +0 -3
- data/spec/resource/test_input.txt +0 -4
- data/spec/scope_spec.rb +0 -149
- data/spec/spec.opts +0 -6
- data/spec/spec_helper.rb +0 -5
- data/spec/spec_util.rb +0 -92
- data/src/cascading/jruby/Main.java +0 -38
- data/src/cascading/jruby/runner.rb +0 -6
- data/tags +0 -342
- data/tasks/ann.rake +0 -80
- data/tasks/ant.rake +0 -23
- data/tasks/bones.rake +0 -20
- data/tasks/gem.rake +0 -206
- data/tasks/git.rake +0 -40
- data/tasks/notes.rake +0 -27
- data/tasks/post_load.rake +0 -34
- data/tasks/rdoc.rake +0 -50
- data/tasks/rubyforge.rake +0 -55
- data/tasks/samples.rake +0 -19
- data/tasks/setup.rb +0 -300
- data/tasks/spec.rake +0 -59
- data/tasks/svn.rake +0 -47
- data/tasks/test.rake +0 -42
- data/test/data/data1.txt +0 -14
- data/test/data/data2.txt +0 -14
- data/test/mock_assemblies.rb +0 -55
data/bin/make_job
DELETED
@@ -1,81 +0,0 @@
|
|
1
|
-
#!/usr/bin/env jruby
|
2
|
-
|
3
|
-
require 'java'
|
4
|
-
|
5
|
-
$LOAD_PATH.unshift(::File.expand_path(::File.join(::File.dirname(__FILE__), "..", "jruby")))
|
6
|
-
$LOAD_PATH.unshift(::File.expand_path(::File.join(::File.dirname(__FILE__), "..", "jobs")))
|
7
|
-
|
8
|
-
require 'rubygems'
|
9
|
-
require 'cascading'
|
10
|
-
require 'fileutils'
|
11
|
-
require 'optparse'
|
12
|
-
require 'ostruct'
|
13
|
-
|
14
|
-
include FileUtils
|
15
|
-
|
16
|
-
TEMP_DIR = "_temp_jars"
|
17
|
-
TEMP_DIR_LIB = ::File.join(TEMP_DIR, "lib")
|
18
|
-
CASCADING_JRUBY_HOME = Cascading::PATH
|
19
|
-
CASCADING_HOME = ENV["CASCADING_HOME"]
|
20
|
-
JRUBY_HOME = ENV["JRUBY_HOME"]
|
21
|
-
|
22
|
-
options = OpenStruct.new
|
23
|
-
options.input = ARGV[0]
|
24
|
-
options.output = "job.jar"
|
25
|
-
options.libs = []
|
26
|
-
|
27
|
-
OptionParser.new do |opts|
|
28
|
-
opts.banner = "Usage: make_job [options]"
|
29
|
-
|
30
|
-
opts.on("-o", "--output", "Set the name of the output jar file (job.jar by default)") do |v|
|
31
|
-
options.output = v
|
32
|
-
end
|
33
|
-
|
34
|
-
opts.on("-l", "--lib LIBPATH", "Set the path where external libraries are stored") do |path|
|
35
|
-
options.libs << path
|
36
|
-
end
|
37
|
-
end.parse!
|
38
|
-
|
39
|
-
p options
|
40
|
-
|
41
|
-
# Create temp dir
|
42
|
-
mkdir(TEMP_DIR) unless File.exists? TEMP_DIR
|
43
|
-
mkdir(TEMP_DIR_LIB) unless File.exists? TEMP_DIR_LIB
|
44
|
-
|
45
|
-
def copy(from, to, message=nil)
|
46
|
-
puts message if message
|
47
|
-
Dir.glob(from).each do |f|
|
48
|
-
cp_r(f, to)
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
# Copy job files into TEMP_DIR:
|
53
|
-
files = ::File.join(options.input, "**", "*.rb")
|
54
|
-
copy(files, TEMP_DIR, "Copying job files to temp dir...")
|
55
|
-
|
56
|
-
# Copy external libs into TEMP_DIR:
|
57
|
-
for lib in options.libs
|
58
|
-
files = ::File.join(lib, "**", "*.jar")
|
59
|
-
copy(files, TEMP_DIR_LIB, "Copying external libs to temp dir...")
|
60
|
-
end
|
61
|
-
|
62
|
-
files = ::File.join(CASCADING_JRUBY_HOME, "lib", "**")
|
63
|
-
copy(files, TEMP_DIR, "Copying cascading.jruby files to temp dir...")
|
64
|
-
|
65
|
-
# Copy cascading.jruby.runner classes:
|
66
|
-
files = ::File.join(CASCADING_JRUBY_HOME, "classes", "**")
|
67
|
-
copy(files, TEMP_DIR, "Copying cascading.jruby files to temp dir...")
|
68
|
-
|
69
|
-
# Copy cascading jars in _temp_jars/lib
|
70
|
-
files = ::File.join(CASCADING_HOME, "**", "*.jar")
|
71
|
-
copy(files, TEMP_DIR_LIB, "Copying Cascading jars to temp dir...")
|
72
|
-
|
73
|
-
# Jar the whole thing:
|
74
|
-
puts "Building final jar file (#{options.output})..."
|
75
|
-
system("jar cvf #{options.output} -C #{TEMP_DIR}/ .")
|
76
|
-
|
77
|
-
# Clean-up things
|
78
|
-
puts "Cleaning temp dir..."
|
79
|
-
rm_rf(TEMP_DIR)
|
80
|
-
|
81
|
-
puts "Finished. Have Fun!"
|
data/ivy.xml
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
2
|
-
<ivy-module version="2.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
3
|
-
xsi:noNamespaceSchemaLocation="http://ant.apache.org/ivy/schemas/ivy.xsd">
|
4
|
-
<info organisation="com.etsy" module="cascading.jruby" status="integration" />
|
5
|
-
|
6
|
-
<configurations>
|
7
|
-
<conf name="default" visibility="public" description="runtime dependencies and master artifact can be used with this conf" extends="runtime,master" />
|
8
|
-
<conf name="master" visibility="public" description="contains only the artifact published by this module itself, with no transitive dependencies" />
|
9
|
-
<conf name="compile" visibility="public" description="this is the default scope, used if none is specified. Compile dependencies are available in all classpaths." />
|
10
|
-
<conf name="provided" visibility="public" description="this is much like compile, but indicates you expect the JDK or a container to provide it. It is only available on the compilation classpath, and is not transitive." />
|
11
|
-
<conf name="runtime" visibility="public" description="this scope indicates that the dependency is not required for compilation, but is for execution. It is in the runtime and test classpaths, but not the compile classpath." extends="compile" />
|
12
|
-
<conf name="test" visibility="private" description="this scope indicates that the dependency is not required for normal use of the application, and is only available for the test compilation and execution phases." extends="runtime" />
|
13
|
-
<conf name="system" visibility="public" description="this scope is similar to provided except that you have to provide the JAR which contains it explicitly. The artifact is always available and is not looked up in a repository." />
|
14
|
-
<conf name="sources" visibility="public" description="this configuration contains the source artifact of this module, if any." />
|
15
|
-
<conf name="javadoc" visibility="public" description="this configuration contains the javadoc artifact of this module, if any." />
|
16
|
-
<conf name="optional" visibility="public" description="contains all optional dependencies" />
|
17
|
-
</configurations>
|
18
|
-
|
19
|
-
<dependencies>
|
20
|
-
<dependency org="cascading" name="cascading-core" rev="2.0.0" conf="default" />
|
21
|
-
<dependency org="cascading" name="cascading-local" rev="2.0.0" conf="default" />
|
22
|
-
<dependency org="cascading" name="cascading-hadoop" rev="2.0.0" conf="default" />
|
23
|
-
<dependency org="org.jruby" name="jruby" rev="1.6.5" conf="default" />
|
24
|
-
</dependencies>
|
25
|
-
</ivy-module>
|
data/ivysettings.xml
DELETED
data/samples/branch.rb
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
#! /usr/bin/env jruby
|
2
|
-
|
3
|
-
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
4
|
-
|
5
|
-
require 'cascading'
|
6
|
-
|
7
|
-
cascade 'branch', :mode => :local do
|
8
|
-
flow 'branch' do
|
9
|
-
source 'input', tap('samples/data/data2.txt')
|
10
|
-
|
11
|
-
assembly 'input' do
|
12
|
-
split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/
|
13
|
-
|
14
|
-
branch 'branch1' do
|
15
|
-
group_by 'score1' do
|
16
|
-
count
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
branch 'branch2' do
|
21
|
-
group_by 'score2' do
|
22
|
-
count
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
sink 'branch1', tap('output/branch1', :sink_mode => :replace)
|
28
|
-
sink 'branch2', tap('output/branch2', :sink_mode => :replace)
|
29
|
-
end
|
30
|
-
end.complete
|
data/samples/copy.rb
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
#! /usr/bin/env jruby
|
2
|
-
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
3
|
-
|
4
|
-
require 'cascading'
|
5
|
-
|
6
|
-
cascade 'copy', :mode => :local do
|
7
|
-
flow 'copy' do
|
8
|
-
# You don't have to curl and cache inputs: tap can fetch via HTTP
|
9
|
-
#source 'input', tap('http://www.census.gov/genealogy/names/dist.all.last')
|
10
|
-
source 'input', tap('samples/data/genealogy/names/dist.all.last')
|
11
|
-
|
12
|
-
assembly 'input' do
|
13
|
-
rename 'line' => 'value'
|
14
|
-
# We override validate_with because we know line will never be null
|
15
|
-
reject 'value:string.indexOf("R") == -1', :validate_with => { :value => 'nothinghere' }
|
16
|
-
end
|
17
|
-
|
18
|
-
sink 'input', tap('output/copy', :sink_mode => :replace)
|
19
|
-
end
|
20
|
-
end.complete
|