elastic-mapreduce 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. data/CHANGELOG +51 -0
  2. data/Gemfile +13 -0
  3. data/Gemfile.lock +16 -0
  4. data/LICENSE.txt +393 -0
  5. data/NOTICE.txt +26 -0
  6. data/README +1007 -0
  7. data/Rakefile +35 -0
  8. data/VERSION +1 -0
  9. data/bin/elastic-mapreduce +27 -0
  10. data/cacert.pem +280 -0
  11. data/elastic-mapreduce.gemspec +104 -0
  12. data/lib/amazon/aws/exceptions.rb +211 -0
  13. data/lib/amazon/coral/awsquery.rb +128 -0
  14. data/lib/amazon/coral/awsquerychainhelper.rb +92 -0
  15. data/lib/amazon/coral/awsqueryhandler.rb +170 -0
  16. data/lib/amazon/coral/awsqueryurihandler.rb +34 -0
  17. data/lib/amazon/coral/call.rb +68 -0
  18. data/lib/amazon/coral/dispatcher.rb +33 -0
  19. data/lib/amazon/coral/ec2client.rb +91 -0
  20. data/lib/amazon/coral/elasticmapreduceclient.rb +198 -0
  21. data/lib/amazon/coral/handler.rb +20 -0
  22. data/lib/amazon/coral/httpdelegationhelper.rb +27 -0
  23. data/lib/amazon/coral/httpdestinationhandler.rb +36 -0
  24. data/lib/amazon/coral/httphandler.rb +124 -0
  25. data/lib/amazon/coral/identityhandler.rb +32 -0
  26. data/lib/amazon/coral/job.rb +25 -0
  27. data/lib/amazon/coral/logfactory.rb +35 -0
  28. data/lib/amazon/coral/option.rb +70 -0
  29. data/lib/amazon/coral/orchestrator.rb +49 -0
  30. data/lib/amazon/coral/querystringmap.rb +93 -0
  31. data/lib/amazon/coral/service.rb +130 -0
  32. data/lib/amazon/coral/simplelog.rb +98 -0
  33. data/lib/amazon/coral/urlencoding.rb +19 -0
  34. data/lib/amazon/coral/v0signaturehandler.rb +33 -0
  35. data/lib/amazon/coral/v0signaturehelper.rb +83 -0
  36. data/lib/amazon/coral/v1signaturehandler.rb +32 -0
  37. data/lib/amazon/coral/v1signaturehelper.rb +58 -0
  38. data/lib/amazon/coral/v2signaturehandler.rb +46 -0
  39. data/lib/amazon/coral/v2signaturehelper.rb +76 -0
  40. data/lib/amazon/retry_delegator.rb +66 -0
  41. data/lib/amazon/stderr_logger.rb +23 -0
  42. data/lib/client.rb +117 -0
  43. data/lib/commands.rb +1690 -0
  44. data/lib/credentials.rb +86 -0
  45. data/lib/ec2_client_wrapper.rb +73 -0
  46. data/lib/json/lexer.rb +294 -0
  47. data/lib/json/objects.rb +200 -0
  48. data/lib/json.rb +58 -0
  49. data/lib/simple_executor.rb +11 -0
  50. data/lib/simple_logger.rb +38 -0
  51. data/lib/uuidtools/version.rb +32 -0
  52. data/lib/uuidtools.rb +655 -0
  53. data/run_tests.rb +8 -0
  54. data/samples/freebase/code/freebase_jobflow.json +44 -0
  55. data/samples/similarity/lastfm_jobflow.json +78 -0
  56. data/samples/wordSplitter.py +18 -0
  57. data/tests/commands_test.rb +587 -0
  58. data/tests/credentials.json +7 -0
  59. data/tests/example.json +14 -0
  60. metadata +154 -0
@@ -0,0 +1,44 @@
1
+ [
2
+ {
3
+ "Name": "MR Step 1: What are the most popular Freebase IDs?",
4
+ "ActionOnFailure": "TERMINATE_JOB_FLOW",
5
+ "HadoopJarStep": {
6
+ "Jar": "/home/hadoop/contrib/streaming/hadoop-0.18-streaming.jar",
7
+ "Args": [
8
+ "-input", "s3n://elasticmapreduce/samples/freebase/input/",
9
+ "-output", "s3n://<bucket>/freebase/step1out/",
10
+ "-mapper", "s3n://elasticmapreduce/samples/freebase/code/mapper.py"
11
+ ]
12
+ }
13
+ },
14
+ {
15
+ "Name": "MR Step 2: Publish top occurrences into SimpleDB",
16
+ "ActionOnFailure": "TERMINATE_JOB_FLOW",
17
+ "HadoopJarStep": {
18
+ "Jar": "/home/hadoop/contrib/streaming/hadoop-0.18-streaming.jar",
19
+ "Args": [
20
+ "-input", "s3n://<bucket>/freebase/step1out/",
21
+ "-output", "s3n://<bucket>/freebase/step2out/",
22
+ "-mapper", "s3n://elasticmapreduce/samples/freebase/code/top_sdb_mapper.rb",
23
+ "-reducer", "s3n://elasticmapreduce/samples/freebase/code/top_sdb_reducer.rb",
24
+ "-cacheFile", "s3n://elasticmapreduce/samples/freebase/code/base64.rb#base64.rb",
25
+ "-cacheFile", "s3n://elasticmapreduce/samples/freebase/code/aws_sdb.rb#aws_sdb.rb"
26
+ ]
27
+ }
28
+ },
29
+ {
30
+ "Name": "MR Step 3: Publish the names into SimpleDB",
31
+ "ActionOnFailure": "TERMINATE_JOB_FLOW",
32
+ "HadoopJarStep": {
33
+ "Jar": "/home/hadoop/contrib/streaming/hadoop-0.18-streaming.jar",
34
+ "Args": [
35
+ "-input", "s3n://elasticmapreduce/samples/freebase/input/",
36
+ "-output", "s3n://<bucket>/freebase/names/step1",
37
+ "-mapper", "s3n://elasticmapreduce/samples/freebase/code/name_mapper.rb",
38
+ "-reducer", "s3n://elasticmapreduce/samples/freebase/code/name_reducer.rb",
39
+ "-cacheFile", "s3n://elasticmapreduce/samples/freebase/code/base64.rb#base64.rb",
40
+ "-cacheFile", "s3n://elasticmapreduce/samples/freebase/code/aws_sdb.rb#aws_sdb.rb"
41
+ ]
42
+ }
43
+ }
44
+ ]
@@ -0,0 +1,78 @@
1
+ [
2
+ {
3
+ "Name": "MR Step 1: Count number of ratings for each item, use single reducer",
4
+ "ActionOnFailure": "TERMINATE_JOB_FLOW",
5
+ "HadoopJarStep": {
6
+ "Jar": "/home/hadoop/contrib/streaming/hadoop-0.18-streaming.jar",
7
+ "Args": [
8
+ "-input", "s3n://elasticmapreduce/samples/similarity/lastfm/input/",
9
+ "-output", "s3n://<bucket>/lastfm/item-counts/",
10
+ "-mapper", "python similarity.py mapper1",
11
+ "-reducer", "python similarity.py reducer1",
12
+ "-cacheFile", "s3n://elasticmapreduce/samples/similarity/similarity.py#similarity.py",
13
+ "-jobconf", "mapred.map.tasks=36",
14
+ "-jobconf", "mapred.reduce.tasks=1",
15
+ "-jobconf", "mapred.compress.map.output=true"
16
+ ]
17
+ }
18
+ },
19
+ {
20
+ "Name": "MR Step 2: Generate sorted item postings with KeyFieldBasedPartitioner",
21
+ "ActionOnFailure": "TERMINATE_JOB_FLOW",
22
+ "HadoopJarStep": {
23
+ "Jar": "/home/hadoop/contrib/streaming/hadoop-0.18-streaming.jar",
24
+ "Args": [
25
+ "-input", "s3n://elasticmapreduce/samples/similarity/lastfm/input/",
26
+ "-output", "hdfs:///home/hadoop/output2/",
27
+ "-mapper", "python similarity.py mapper2 log",
28
+ "-reducer", "python similarity.py reducer2",
29
+ "-cacheFile", "s3n://elasticmapreduce/samples/similarity/similarity.py#similarity.py",
30
+ "-jobconf", "mapred.map.tasks=36",
31
+ "-jobconf", "mapred.reduce.tasks=18",
32
+ "-partitioner", "org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner",
33
+ "-jobconf", "map.output.key.field.separator=,",
34
+ "-jobconf", "num.key.fields.for.partition=1",
35
+ "-jobconf", "mapred.compress.map.output=true"
36
+ ]
37
+ }
38
+ },
39
+ {
40
+ "Name": "MR Step 3: Item Similarity using Random Sampling & Distributed Cache",
41
+ "ActionOnFailure": "TERMINATE_JOB_FLOW",
42
+ "HadoopJarStep": {
43
+ "Jar": "/home/hadoop/contrib/streaming/hadoop-0.18-streaming.jar",
44
+ "Args": [
45
+ "-input", "hdfs:///home/hadoop/output2/",
46
+ "-output", "hdfs:///home/hadoop/output3/",
47
+ "-mapper", "python similarity.py mapper3 90 item_rating_counts.txt",
48
+ "-reducer", "python similarity.py reducer3 147160",
49
+ "-cacheFile", "s3n://elasticmapreduce/samples/similarity/similarity.py#similarity.py",
50
+ "-cacheFile", "s3n://<bucket>/lastfm/item-counts/part-00000#item_rating_counts.txt",
51
+ "-jobconf", "mapred.map.tasks=36",
52
+ "-jobconf", "mapred.reduce.tasks=18",
53
+ "-jobconf", "mapred.compress.map.output=true"
54
+ ]
55
+ }
56
+ },
57
+ {
58
+ "Name": "MR Step 4: For each item, emit K=25 most similar items with KeyFieldBasedPartitioner",
59
+ "ActionOnFailure": "TERMINATE_JOB_FLOW",
60
+ "HadoopJarStep": {
61
+ "Jar": "/home/hadoop/contrib/streaming/hadoop-0.18-streaming.jar",
62
+ "Args": [
63
+ "-input", "hdfs:///home/hadoop/output3/",
64
+ "-output", "s3n://<bucket>/lastfm/output-large-50/",
65
+ "-mapper", "python similarity.py mapper4 5",
66
+ "-reducer", "python similarity.py reducer4 25 artist_data.txt",
67
+ "-cacheFile", "s3n://elasticmapreduce/samples/similarity/lastfm/artist_data.txt#artist_data.txt",
68
+ "-cacheFile", "s3n://elasticmapreduce/samples/similarity/similarity.py#similarity.py",
69
+ "-jobconf", "mapred.map.tasks=36",
70
+ "-jobconf", "mapred.reduce.tasks=18",
71
+ "-partitioner", "org.apache.hadoop.mapred.lib.KeyFieldBasedPartitioner",
72
+ "-jobconf", "map.output.key.field.separator=,",
73
+ "-jobconf", "num.key.fields.for.partition=1",
74
+ "-jobconf", "mapred.compress.map.output=true"
75
+ ]
76
+ }
77
+ }
78
+ ]
@@ -0,0 +1,18 @@
#!/usr/bin/python
#
# Hadoop Streaming mapper: emits one "LongValueSum:<word>\t1" record per
# word found on stdin, so the streaming aggregate reducer can sum the
# counts into word frequencies.

import sys
import re

# A word starts with a letter and continues with letters or digits.
# Compiled once at module level instead of on every call to main().
WORD_RE = re.compile(r"[a-zA-Z][a-zA-Z0-9]*")


def main(argv):
    """Read lines from stdin and write one count record per word to stdout.

    Each word is lower-cased and emitted as "LongValueSum:<word>\t1" —
    the key prefix understood by Hadoop Streaming's aggregate library.
    `argv` is accepted for interface compatibility but is unused.
    """
    # readline()/iteration yields "" at end of input and never raises, so
    # a plain loop suffices.  The original `except "end of file":` clause
    # was a string exception: dead code (nothing ever raised it) and
    # invalid in modern Python, so it has been removed.
    for line in sys.stdin:
        for word in WORD_RE.findall(line):
            # sys.stdout.write (rather than the Python-2-only `print`
            # statement) keeps the script runnable under Python 2 and 3;
            # the emitted bytes are identical.
            sys.stdout.write("LongValueSum:" + word.lower() + "\t" + "1\n")


if __name__ == "__main__":
    main(sys.argv)