cascading.jruby 0.0.9 → 0.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +15 -0
- data/lib/cascading/assembly.rb +138 -17
- data/lib/cascading/base.rb +0 -4
- data/lib/cascading/cascade.rb +25 -16
- data/lib/cascading/cascading.rb +25 -5
- data/lib/cascading/ext/array.rb +1 -7
- data/lib/cascading/flow.rb +18 -19
- data/lib/cascading/mode.rb +5 -1
- data/lib/cascading/operations.rb +11 -4
- data/lib/cascading/tap.rb +4 -0
- data/lib/cascading.rb +1 -5
- data/test/test_assembly.rb +135 -29
- data/test/test_cascade.rb +80 -0
- data/test/test_flow.rb +20 -0
- data/test/test_operations.rb +3 -2
- metadata +6 -76
- data/.travis.yml +0 -6
- data/Gemfile +0 -6
- data/Gemfile.lock +0 -12
- data/HACKING.md +0 -23
- data/README.md +0 -9
- data/Rakefile +0 -46
- data/TODO +0 -13
- data/bin/make_job +0 -81
- data/ivy.xml +0 -25
- data/ivysettings.xml +0 -7
- data/samples/branch.rb +0 -30
- data/samples/copy.rb +0 -20
- data/samples/data/data2.txt +0 -88799
- data/samples/data/data_group_by.txt +0 -7
- data/samples/data/data_join1.txt +0 -3
- data/samples/data/data_join2.txt +0 -3
- data/samples/data/data_join3.txt +0 -3
- data/samples/data/genealogy/names/dist.all.last +0 -88799
- data/samples/data/gutenberg/the_outline_of_science_vol_1 +0 -12761
- data/samples/group_by.rb +0 -61
- data/samples/join.rb +0 -31
- data/samples/logwordcount.rb +0 -22
- data/samples/project.rb +0 -23
- data/samples/rename.rb +0 -20
- data/samples/scorenames.rb +0 -20
- data/samples/splitter.rb +0 -19
- data/samples/sub_assembly.rb +0 -30
- data/samples/union.rb +0 -36
- data/spec/cascading_spec.rb +0 -105
- data/spec/expr_spec.rb +0 -230
- data/spec/jruby_version_spec.rb +0 -72
- data/spec/resource/join_input.txt +0 -3
- data/spec/resource/test_input.txt +0 -4
- data/spec/scope_spec.rb +0 -149
- data/spec/spec.opts +0 -6
- data/spec/spec_helper.rb +0 -5
- data/spec/spec_util.rb +0 -92
- data/src/cascading/jruby/Main.java +0 -38
- data/src/cascading/jruby/runner.rb +0 -6
- data/tags +0 -342
- data/tasks/ann.rake +0 -80
- data/tasks/ant.rake +0 -23
- data/tasks/bones.rake +0 -20
- data/tasks/gem.rake +0 -206
- data/tasks/git.rake +0 -40
- data/tasks/notes.rake +0 -27
- data/tasks/post_load.rake +0 -34
- data/tasks/rdoc.rake +0 -50
- data/tasks/rubyforge.rake +0 -55
- data/tasks/samples.rake +0 -19
- data/tasks/setup.rb +0 -300
- data/tasks/spec.rake +0 -59
- data/tasks/svn.rake +0 -47
- data/tasks/test.rake +0 -42
- data/test/data/data1.txt +0 -14
- data/test/data/data2.txt +0 -14
- data/test/mock_assemblies.rb +0 -55
data/bin/make_job
DELETED
@@ -1,81 +0,0 @@
|
|
1
|
-
#!/usr/bin/env jruby
|
2
|
-
|
3
|
-
require 'java'
|
4
|
-
|
5
|
-
$LOAD_PATH.unshift(::File.expand_path(::File.join(::File.dirname(__FILE__), "..", "jruby")))
|
6
|
-
$LOAD_PATH.unshift(::File.expand_path(::File.join(::File.dirname(__FILE__), "..", "jobs")))
|
7
|
-
|
8
|
-
require 'rubygems'
|
9
|
-
require 'cascading'
|
10
|
-
require 'fileutils'
|
11
|
-
require 'optparse'
|
12
|
-
require 'ostruct'
|
13
|
-
|
14
|
-
include FileUtils
|
15
|
-
|
16
|
-
TEMP_DIR = "_temp_jars"
|
17
|
-
TEMP_DIR_LIB = ::File.join(TEMP_DIR, "lib")
|
18
|
-
CASCADING_JRUBY_HOME = Cascading::PATH
|
19
|
-
CASCADING_HOME = ENV["CASCADING_HOME"]
|
20
|
-
JRUBY_HOME = ENV["JRUBY_HOME"]
|
21
|
-
|
22
|
-
options = OpenStruct.new
|
23
|
-
options.input = ARGV[0]
|
24
|
-
options.output = "job.jar"
|
25
|
-
options.libs = []
|
26
|
-
|
27
|
-
OptionParser.new do |opts|
|
28
|
-
opts.banner = "Usage: make_job [options]"
|
29
|
-
|
30
|
-
opts.on("-o", "--output", "Set the name of the output jar file (job.jar by default)") do |v|
|
31
|
-
options.output = v
|
32
|
-
end
|
33
|
-
|
34
|
-
opts.on("-l", "--lib LIBPATH", "Set the path where external libraries are stored") do |path|
|
35
|
-
options.libs << path
|
36
|
-
end
|
37
|
-
end.parse!
|
38
|
-
|
39
|
-
p options
|
40
|
-
|
41
|
-
# Create temp dir
|
42
|
-
mkdir(TEMP_DIR) unless File.exists? TEMP_DIR
|
43
|
-
mkdir(TEMP_DIR_LIB) unless File.exists? TEMP_DIR_LIB
|
44
|
-
|
45
|
-
def copy(from, to, message=nil)
|
46
|
-
puts message if message
|
47
|
-
Dir.glob(from).each do |f|
|
48
|
-
cp_r(f, to)
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
# Copy job files into TEMP_DIR:
|
53
|
-
files = ::File.join(options.input, "**", "*.rb")
|
54
|
-
copy(files, TEMP_DIR, "Copying job files to temp dir...")
|
55
|
-
|
56
|
-
# Copy external libs into TEMP_DIR:
|
57
|
-
for lib in options.libs
|
58
|
-
files = ::File.join(lib, "**", "*.jar")
|
59
|
-
copy(files, TEMP_DIR_LIB, "Copying external libs to temp dir...")
|
60
|
-
end
|
61
|
-
|
62
|
-
files = ::File.join(CASCADING_JRUBY_HOME, "lib", "**")
|
63
|
-
copy(files, TEMP_DIR, "Copying cascading.jruby files to temp dir...")
|
64
|
-
|
65
|
-
# Copy cascading.jruby.runner classes:
|
66
|
-
files = ::File.join(CASCADING_JRUBY_HOME, "classes", "**")
|
67
|
-
copy(files, TEMP_DIR, "Copying cascading.jruby files to temp dir...")
|
68
|
-
|
69
|
-
# Copy cascading jars in _temp_jars/lib
|
70
|
-
files = ::File.join(CASCADING_HOME, "**", "*.jar")
|
71
|
-
copy(files, TEMP_DIR_LIB, "Copying Cascading jars to temp dir...")
|
72
|
-
|
73
|
-
# Jar the whole thing:
|
74
|
-
puts "Building final jar file (#{options.output})..."
|
75
|
-
system("jar cvf #{options.output} -C #{TEMP_DIR}/ .")
|
76
|
-
|
77
|
-
# Clean-up things
|
78
|
-
puts "Cleaning temp dir..."
|
79
|
-
rm_rf(TEMP_DIR)
|
80
|
-
|
81
|
-
puts "Finished. Have Fun!"
|
data/ivy.xml
DELETED
@@ -1,25 +0,0 @@
|
|
1
|
-
<?xml version="1.0" encoding="ISO-8859-1"?>
|
2
|
-
<ivy-module version="2.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
3
|
-
xsi:noNamespaceSchemaLocation="http://ant.apache.org/ivy/schemas/ivy.xsd">
|
4
|
-
<info organisation="com.etsy" module="cascading.jruby" status="integration" />
|
5
|
-
|
6
|
-
<configurations>
|
7
|
-
<conf name="default" visibility="public" description="runtime dependencies and master artifact can be used with this conf" extends="runtime,master" />
|
8
|
-
<conf name="master" visibility="public" description="contains only the artifact published by this module itself, with no transitive dependencies" />
|
9
|
-
<conf name="compile" visibility="public" description="this is the default scope, used if none is specified. Compile dependencies are available in all classpaths." />
|
10
|
-
<conf name="provided" visibility="public" description="this is much like compile, but indicates you expect the JDK or a container to provide it. It is only available on the compilation classpath, and is not transitive." />
|
11
|
-
<conf name="runtime" visibility="public" description="this scope indicates that the dependency is not required for compilation, but is for execution. It is in the runtime and test classpaths, but not the compile classpath." extends="compile" />
|
12
|
-
<conf name="test" visibility="private" description="this scope indicates that the dependency is not required for normal use of the application, and is only available for the test compilation and execution phases." extends="runtime" />
|
13
|
-
<conf name="system" visibility="public" description="this scope is similar to provided except that you have to provide the JAR which contains it explicitly. The artifact is always available and is not looked up in a repository." />
|
14
|
-
<conf name="sources" visibility="public" description="this configuration contains the source artifact of this module, if any." />
|
15
|
-
<conf name="javadoc" visibility="public" description="this configuration contains the javadoc artifact of this module, if any." />
|
16
|
-
<conf name="optional" visibility="public" description="contains all optional dependencies" />
|
17
|
-
</configurations>
|
18
|
-
|
19
|
-
<dependencies>
|
20
|
-
<dependency org="cascading" name="cascading-core" rev="2.0.0" conf="default" />
|
21
|
-
<dependency org="cascading" name="cascading-local" rev="2.0.0" conf="default" />
|
22
|
-
<dependency org="cascading" name="cascading-hadoop" rev="2.0.0" conf="default" />
|
23
|
-
<dependency org="org.jruby" name="jruby" rev="1.6.5" conf="default" />
|
24
|
-
</dependencies>
|
25
|
-
</ivy-module>
|
data/ivysettings.xml
DELETED
data/samples/branch.rb
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
#! /usr/bin/env jruby
|
2
|
-
|
3
|
-
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
4
|
-
|
5
|
-
require 'cascading'
|
6
|
-
|
7
|
-
cascade 'branch', :mode => :local do
|
8
|
-
flow 'branch' do
|
9
|
-
source 'input', tap('samples/data/data2.txt')
|
10
|
-
|
11
|
-
assembly 'input' do
|
12
|
-
split 'line', ['name', 'score1', 'score2', 'id'], :pattern => /[.,]*\s+/
|
13
|
-
|
14
|
-
branch 'branch1' do
|
15
|
-
group_by 'score1' do
|
16
|
-
count
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
branch 'branch2' do
|
21
|
-
group_by 'score2' do
|
22
|
-
count
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
sink 'branch1', tap('output/branch1', :sink_mode => :replace)
|
28
|
-
sink 'branch2', tap('output/branch2', :sink_mode => :replace)
|
29
|
-
end
|
30
|
-
end.complete
|
data/samples/copy.rb
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
#! /usr/bin/env jruby
|
2
|
-
$: << File.join(File.dirname(__FILE__), '..', 'lib')
|
3
|
-
|
4
|
-
require 'cascading'
|
5
|
-
|
6
|
-
cascade 'copy', :mode => :local do
|
7
|
-
flow 'copy' do
|
8
|
-
# You don't have to curl and cache inputs: tap can fetch via HTTP
|
9
|
-
#source 'input', tap('http://www.census.gov/genealogy/names/dist.all.last')
|
10
|
-
source 'input', tap('samples/data/genealogy/names/dist.all.last')
|
11
|
-
|
12
|
-
assembly 'input' do
|
13
|
-
rename 'line' => 'value'
|
14
|
-
# We override validate_with because we know line will never be null
|
15
|
-
reject 'value:string.indexOf("R") == -1', :validate_with => { :value => 'nothinghere' }
|
16
|
-
end
|
17
|
-
|
18
|
-
sink 'input', tap('output/copy', :sink_mode => :replace)
|
19
|
-
end
|
20
|
-
end.complete
|