wukong 1.5.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. data/CHANGELOG.textile +32 -0
  2. data/README.textile +58 -12
  3. data/TODO.textile +0 -8
  4. data/bin/hdp-bzip +12 -17
  5. data/bin/hdp-kill-task +1 -1
  6. data/bin/hdp-sort +7 -7
  7. data/bin/hdp-stream +7 -7
  8. data/bin/hdp-stream-flat +2 -3
  9. data/bin/setcat +11 -0
  10. data/bin/uniq-ord +59 -0
  11. data/examples/corpus/bucket_counter.rb +47 -0
  12. data/examples/corpus/dbpedia_abstract_to_sentences.rb +85 -0
  13. data/examples/corpus/sentence_coocurrence.rb +70 -0
  14. data/examples/emr/README.textile +110 -0
  15. data/examples/emr/dot_wukong_dir/emr_bootstrap.sh +1 -0
  16. data/examples/emr/elastic_mapreduce_example.rb +2 -2
  17. data/examples/ignore_me/counting.rb +56 -0
  18. data/examples/ignore_me/grouper.rb +71 -0
  19. data/examples/network_graph/adjacency_list.rb +2 -2
  20. data/examples/network_graph/breadth_first_search.rb +14 -21
  21. data/examples/network_graph/gen_multi_edge.rb +22 -13
  22. data/examples/pagerank/pagerank.rb +1 -1
  23. data/examples/pagerank/pagerank_initialize.rb +6 -10
  24. data/examples/sample_records.rb +6 -16
  25. data/examples/server_logs/apache_log_parser.rb +7 -22
  26. data/examples/server_logs/breadcrumbs.rb +39 -0
  27. data/examples/server_logs/logline.rb +27 -0
  28. data/examples/size.rb +3 -2
  29. data/examples/{binning_percentile_estimator.rb → stats/binning_percentile_estimator.rb} +9 -11
  30. data/examples/{rank_and_bin.rb → stats/rank_and_bin.rb} +2 -2
  31. data/examples/stupidly_simple_filter.rb +11 -14
  32. data/examples/word_count.rb +16 -36
  33. data/lib/wukong/and_pig.rb +2 -15
  34. data/lib/wukong/logger.rb +7 -28
  35. data/lib/wukong/periodic_monitor.rb +24 -9
  36. data/lib/wukong/script/emr_command.rb +1 -0
  37. data/lib/wukong/script/hadoop_command.rb +31 -29
  38. data/lib/wukong/script.rb +19 -14
  39. data/lib/wukong/store/cassandra_model.rb +2 -1
  40. data/lib/wukong/streamer/accumulating_reducer.rb +5 -9
  41. data/lib/wukong/streamer/base.rb +44 -3
  42. data/lib/wukong/streamer/counting_reducer.rb +12 -12
  43. data/lib/wukong/streamer/filter.rb +2 -2
  44. data/lib/wukong/streamer/list_reducer.rb +3 -3
  45. data/lib/wukong/streamer/reducer.rb +11 -0
  46. data/lib/wukong/streamer.rb +7 -3
  47. data/lib/wukong.rb +7 -3
  48. data/{examples → old}/cassandra_streaming/berlitz_for_cassandra.textile +0 -0
  49. data/{examples → old}/cassandra_streaming/client_interface_notes.textile +0 -0
  50. data/{examples → old}/cassandra_streaming/client_schema.textile +0 -0
  51. data/{examples → old}/cassandra_streaming/tuning.textile +0 -0
  52. data/wukong.gemspec +257 -285
  53. metadata +45 -62
  54. data/examples/cassandra_streaming/avromapper.rb +0 -85
  55. data/examples/cassandra_streaming/cassandra.avpr +0 -468
  56. data/examples/cassandra_streaming/cassandra_random_partitioner.rb +0 -62
  57. data/examples/cassandra_streaming/catter.sh +0 -45
  58. data/examples/cassandra_streaming/client_schema.avpr +0 -211
  59. data/examples/cassandra_streaming/foofile.avr +0 -0
  60. data/examples/cassandra_streaming/pymap.sh +0 -1
  61. data/examples/cassandra_streaming/pyreduce.sh +0 -1
  62. data/examples/cassandra_streaming/smutation.avpr +0 -188
  63. data/examples/cassandra_streaming/streamer.sh +0 -51
  64. data/examples/cassandra_streaming/struct_loader.rb +0 -24
  65. data/examples/count_keys.rb +0 -56
  66. data/examples/count_keys_at_mapper.rb +0 -57
  67. data/examples/emr/README-elastic_map_reduce.textile +0 -26
  68. data/examples/keystore/cassandra_batch_test.rb +0 -41
  69. data/examples/keystore/conditional_outputter_example.rb +0 -70
  70. data/examples/store/chunked_store_example.rb +0 -18
  71. data/lib/wukong/dfs.rb +0 -81
  72. data/lib/wukong/keystore/cassandra_conditional_outputter.rb +0 -122
  73. data/lib/wukong/keystore/redis_db.rb +0 -24
  74. data/lib/wukong/keystore/tyrant_db.rb +0 -137
  75. data/lib/wukong/keystore/tyrant_notes.textile +0 -145
  76. data/lib/wukong/models/graph.rb +0 -25
  77. data/lib/wukong/monitor/chunked_store.rb +0 -23
  78. data/lib/wukong/monitor/periodic_logger.rb +0 -34
  79. data/lib/wukong/monitor/periodic_monitor.rb +0 -70
  80. data/lib/wukong/monitor.rb +0 -7
  81. data/lib/wukong/rdf.rb +0 -104
  82. data/lib/wukong/streamer/cassandra_streamer.rb +0 -61
  83. data/lib/wukong/streamer/count_keys.rb +0 -30
  84. data/lib/wukong/streamer/count_lines.rb +0 -26
  85. data/lib/wukong/streamer/em_streamer.rb +0 -7
  86. data/lib/wukong/streamer/preprocess_with_pipe_streamer.rb +0 -22
  87. data/lib/wukong/wukong_class.rb +0 -21
@@ -1,211 +0,0 @@
1
- {
2
- "protocol" : "Cassandra",
3
- "namespace" : "org.apache.cassandra.avro", "types" : [
4
-
5
- Add/insert one value
6
-
7
- Mutate ks, [col_ref], 'val', ts, ttl }
8
-
9
- Add/insert multiple cols to same row
10
-
11
- MutateRow ks, supercol_or_nil, { [col, val, ts, ttl], [col,val,ts,ttl],...}}
12
- MutateCRow ks, { [col, val, ts, ttl], [col,val,ts,ttl],...}}
13
- MutateSCRow ks, supercol, { [col, val, ts, ttl], [col,val,ts,ttl],...}}
14
-
15
- Get one, many or all columns from given row
16
-
17
- get
18
- Multiget ks, supercol_or_nil, [col1, col2, ...] or nil
19
-
20
- Get one, many or all columns from a slice of sequential rows
21
-
22
- get_range
23
-
24
- Remove one column from a row
25
-
26
- remove
27
-
28
- Remove many columns from a row
29
-
30
- Remove all columns in a row
31
-
32
-
33
-
34
- h3. Mo
35
-
36
-
37
- { "name" : "AccessLevel", "type" : "enum", "symbols" : [ "NONE", "READONLY", "READWRITE", "FALL" ] },
38
- { "name" : "ColumnPath", "type" : "record", "fields" : [
39
- { "name" : "column_family", "type" : "string"},
40
- { "name" : "super_column", "type" : [ "bytes", "null" ]},
41
- { "name" : "column", "type" : [ "bytes", "null" ] } ]},
42
- { "name" : "ColumnParent", "type" : "record", "fields" : [
43
- { "name" : "column_family", "type" : "string"},
44
- { "name" : "super_column", "type" : [ "bytes", "null" ] } ]},
45
- { "name" : "SliceRange", "type" : "record", "fields" : [
46
- { "name" : "start", "type" : "bytes"},
47
- { "name" : "finish", "type" : "bytes"},
48
- { "name" : "reversed", "type" : "boolean"},
49
- { "name" : "count", "type" : "int"},
50
- { "name" : "bitmasks", "type" : [ { "type" : "array", "items" : "bytes"}, "null" ] } ]},
51
- { "name" : "SlicePredicate", "type" : "record", "fields" : [
52
- { "name" : "column_names", "type" : [ { "type" : "array", "items" : "bytes"}, "null" ]},
53
- { "name" : "slice_range", "type" : [ "SliceRange", "null" ] } ]},
54
-
55
- { "name" : "Clock", "type" : "record", "fields" : [
56
- { "name" : "timestamp", "type" : "long" } ]},
57
- { "name" : "Column", "type" : "record", "fields" : [
58
- { "name" : "name", "type" : "bytes"},
59
- { "name" : "value", "type" : "bytes"},
60
- { "name" : "clock", "type" : "Clock"},
61
- { "name" : "ttl", "type" : "int" } ]},
62
- { "name" : "SuperColumn", "type" : "record", "fields" : [
63
- { "name" : "name", "type" : "bytes"},
64
- { "name" : "columns", "type" : { "type" : "array", "items" : "Column" } } ]},
65
- { "name" : "ColumnOrSuperColumn", "type" : "record", "fields" : [
66
- { "name" : "column", "type" : "Column" },
67
- { "name" : "super_column", "type" : "null" } ]},
68
- { "name" : "Deletion", "type" : "record", "fields" : [
69
- { "name" : "clock", "type" : "Clock"},
70
- { "name" : "super_column", "type" : [ "bytes", "null" ]},
71
- { "name" : "predicate", "type" : [ "SlicePredicate", "null" ] } ]},
72
- { "name" : "Mutation", "type" : "record", "fields" : [
73
- { "name" : "column_or_supercolumn", "type" : "ColumnOrSuperColumn" },
74
- { "name" : "deletion", "type" : "null" }
75
- ]},
76
- { "name" : "StreamingMutation", "type" : "record", "fields" : [
77
- { "name" : "key", "type" : "bytes" },
78
- { "name" : "mutation", "type" : "Mutation" } ]},
79
-
80
- { "name" : "IndexType", "type" : "enum", "symbols" : [ "KEYS" ]},
81
- { "name" : "ColumnDef", "type" : "record", "fields" : [
82
- { "name" : "name", "type" : "bytes"},
83
- { "name" : "validation_class", "type" : "string"},
84
- { "name" : "index_type", "type" : [ "IndexType", "null" ]},
85
- { "name" : "index_name", "type" : [ "string", "null" ] } ]},
86
- { "name" : "CfDef", "type" : "record", "fields" : [
87
- { "name" : "keyspace", "type" : "string"},
88
- { "name" : "name", "type" : "string"},
89
- { "name" : "column_type", "type" : [ "string", "null" ]},
90
- { "name" : "clock_type", "type" : [ "string", "null" ]},
91
- { "name" : "comparator_type", "type" : [ "string", "null" ]},
92
- { "name" : "subcomparator_type", "type" : [ "string", "null" ]},
93
- { "name" : "reconciler", "type" : [ "string", "null" ]},
94
- { "name" : "comment", "type" : [ "string", "null" ]},
95
- { "name" : "row_cache_size", "type" : [ "double", "null" ]},
96
- { "name" : "preload_row_cache", "type" : [ "boolean", "null" ]},
97
- { "name" : "key_cache_size", "type" : [ "double", "null" ]},
98
- { "name" : "read_repair_chance", "type" : [ "double", "null" ]},
99
- { "name" : "gc_grace_seconds", "type" : [ "int", "null" ]},
100
- { "name" : "column_metadata", "type" : [ { "type" : "array", "items" : "ColumnDef"}, "null" ]},
101
- { "name" : "id", "type" : [ "int", "null" ] } ]},
102
- { "name" : "KsDef", "type" : "record", "fields" : [
103
- { "name" : "name", "type" : "string"}, { "name" : "strategy_class", "type" : "string"},
104
- { "name" : "strategy_options", "type" : [ { "type" : "map", "values" : "string"}, "null" ]},
105
- { "name" : "replication_factor", "type" : "int"}, { "name" : "cf_defs", "type" : { "type" : "array", "items" : "CfDef" } } ]},
106
- { "name" : "MutationsMapEntry", "type" : "record", "fields" : [ { "name" : "key", "type" : "bytes"}, { "name" : "mutations", "type" : { "type" : "map", "values" : { "type" : "array", "items" : "Mutation" } } } ]},
107
- { "name" : "CoscsMapEntry", "type" : "record", "fields" : [ { "name" : "key", "type" : "bytes"}, { "name" : "columns", "type" : { "type" : "array", "items" : "ColumnOrSuperColumn" } } ]},
108
- { "name" : "ConsistencyLevel", "type" : "enum", "symbols" : [ "ZERO", "ONE", "QUORUM", "DCQUORUM", "DCQUORUMSYNC", "ALL" ]},
109
- { "name" : "InvalidRequestException", "type" : "error", "fields" : [ { "name" : "why", "type" : [ "string", "null" ] } ]},
110
- { "name" : "NotFoundException", "type" : "error", "fields" : [ { "name" : "why", "type" : [ "string", "null" ] } ]},
111
- { "name" : "UnavailableException", "type" : "error", "fields" : [ { "name" : "why", "type" : [ "string", "null" ] } ]},
112
- { "name" : "TimedOutException", "type" : "error", "fields" : [ { "name" : "why", "type" : [ "string", "null" ] } ] }
113
- ],
114
-
115
-
116
- "messages" : { "get" : {
117
- "request" : [ { "name" : "key", "type" : "bytes"},
118
- { "name" : "column_path", "type" : "ColumnPath"},
119
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
120
- } ],
121
- "response" : "ColumnOrSuperColumn",
122
- "errors" : [ "InvalidRequestException", "NotFoundException", "UnavailableException", "TimedOutException" ]
123
- },
124
- "get_slice" : {
125
- "request" : [ { "name" : "key", "type" : "bytes"},
126
- { "name" : "column_parent", "type" : "ColumnParent"},
127
- { "name" : "predicate", "type" : "SlicePredicate"},
128
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
129
- } ],
130
- "response" : { "type" : "array",
131
- "items" : "ColumnOrSuperColumn"
132
- },
133
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
134
- },
135
- "multiget_slice" : {
136
- "request" : [ { "name" : "keys", "type" : { "type" : "array",
137
- "items" : "bytes"
138
- }},
139
- { "name" : "column_parent", "type" : "ColumnParent"},
140
- { "name" : "predicate", "type" : "SlicePredicate"},
141
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
142
- } ],
143
- "response" : { "type" : "array",
144
- "items" : "CoscsMapEntry"
145
- },
146
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
147
- },
148
- "get_count" : {
149
- "request" : [ { "name" : "key", "type" : "bytes"},
150
- { "name" : "column_parent", "type" : "ColumnParent"},
151
- { "name" : "predicate", "type" : "SlicePredicate"},
152
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
153
- } ],
154
- "response" : "int",
155
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
156
- },
157
- "insert" : {
158
- "request" : [ { "name" : "key", "type" : "bytes"},
159
- { "name" : "column_parent", "type" : "ColumnParent"},
160
- { "name" : "column", "type" : "Column"},
161
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
162
- } ],
163
- "response" : "null",
164
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
165
- },
166
- "remove" : {
167
- "request" : [ { "name" : "key", "type" : "bytes"},
168
- { "name" : "column_path", "type" : "ColumnPath"},
169
- { "name" : "clock", "type" : "Clock"},
170
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
171
- } ],
172
- "response" : "null",
173
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
174
- },
175
- "batch_mutate" : {
176
- "request" : [ { "name" : "mutation_map", "type" : { "type" : "array",
177
- "items" : "MutationsMapEntry"
178
- }},
179
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
180
- } ],
181
- "response" : "null",
182
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
183
- },
184
- "system_add_keyspace" : {
185
- "request" : [ { "name" : "ks_def", "type" : "KsDef"
186
- } ],
187
- "response" : "null",
188
- "errors" : [ "InvalidRequestException" ]
189
- },
190
- "set_keyspace" : {
191
- "request" : [ { "name" : "keyspace", "type" : "string"
192
- } ],
193
- "response" : "null",
194
- "errors" : [ "InvalidRequestException" ]
195
- },
196
- "describe_keyspaces" : {
197
- "request" : [ ],
198
- "response" : { "type" : "array",
199
- "items" : "string"
200
- }
201
- },
202
- "describe_cluster_name" : {
203
- "request" : [ ],
204
- "response" : "string"
205
- },
206
- "describe_version" : {
207
- "request" : [ ],
208
- "response" : "string"
209
- }
210
- }
211
- }
@@ -1 +0,0 @@
1
- python /usr/local/share/cassandra/contrib/hadoop_streaming_output/bin/mapper.py
@@ -1 +0,0 @@
1
- python /usr/local/share/cassandra/contrib/hadoop_streaming_output/bin/reducer.py
@@ -1,188 +0,0 @@
1
- {
2
- "protocol" : "Cassandra",
3
- "namespace" : "org.apache.cassandra.avro", "types" : [
4
-
5
- { "name" : "AccessLevel", "type" : "enum", "symbols" : [ "NONE", "READONLY", "READWRITE", "FALL" ] },
6
- { "name" : "ColumnPath", "type" : "record", "fields" : [
7
- { "name" : "column_family", "type" : "string"},
8
- { "name" : "super_column", "type" : [ "bytes", "null" ]},
9
- { "name" : "column", "type" : [ "bytes", "null" ] } ]},
10
- { "name" : "ColumnParent", "type" : "record", "fields" : [
11
- { "name" : "column_family", "type" : "string"},
12
- { "name" : "super_column", "type" : [ "bytes", "null" ] } ]},
13
- { "name" : "SliceRange", "type" : "record", "fields" : [
14
- { "name" : "start", "type" : "bytes"},
15
- { "name" : "finish", "type" : "bytes"},
16
- { "name" : "reversed", "type" : "boolean"},
17
- { "name" : "count", "type" : "int"},
18
- { "name" : "bitmasks", "type" : [ { "type" : "array", "items" : "bytes"}, "null" ] } ]},
19
- { "name" : "SlicePredicate", "type" : "record", "fields" : [
20
- { "name" : "column_names", "type" : [ { "type" : "array", "items" : "bytes"}, "null" ]},
21
- { "name" : "slice_range", "type" : [ "SliceRange", "null" ] } ]},
22
-
23
- { "name": "StupidColumnMutation", "type": "record",
24
- "fields": [
25
- { "name" : "key", "type" : "bytes" },
26
- { "name" : "name", "type" : "bytes" },
27
- { "name" : "value", "type" : "bytes" },
28
- { "name" : "timestamp", "type" : "long" },
29
- { "name" : "ttl", "type" : "int" }
30
- ]},
31
-
32
- { "name" : "Clock", "type" : "record", "fields" : [
33
- { "name" : "timestamp", "type" : "long" } ]},
34
- { "name" : "Column", "type" : "record", "fields" : [
35
- { "name" : "name", "type" : "bytes"},
36
- { "name" : "value", "type" : "bytes"},
37
- { "name" : "clock", "type" : "Clock"},
38
- { "name" : "ttl", "type" : "int" } ]},
39
- { "name" : "SuperColumn", "type" : "record", "fields" : [
40
- { "name" : "name", "type" : "bytes"},
41
- { "name" : "columns", "type" : { "type" : "array", "items" : "Column" } } ]},
42
- { "name" : "ColumnOrSuperColumn", "type" : "record", "fields" : [
43
- { "name" : "column", "type" : "Column" },
44
- { "name" : "super_column", "type" : "null" } ]},
45
- { "name" : "Deletion", "type" : "record", "fields" : [
46
- { "name" : "clock", "type" : "Clock"},
47
- { "name" : "super_column", "type" : [ "bytes", "null" ]},
48
- { "name" : "predicate", "type" : [ "SlicePredicate", "null" ] } ]},
49
- { "name" : "Mutation", "type" : "record", "fields" : [
50
- { "name" : "column_or_supercolumn", "type" : "ColumnOrSuperColumn" },
51
- { "name" : "deletion", "type" : "null" }
52
- ]},
53
- { "name" : "StreamingMutation", "type" : "record", "fields" : [
54
- { "name" : "key", "type" : "bytes" },
55
- { "name" : "mutation", "type" : "Mutation" } ]},
56
-
57
- { "name" : "IndexType", "type" : "enum", "symbols" : [ "KEYS" ]},
58
- { "name" : "ColumnDef", "type" : "record", "fields" : [
59
- { "name" : "name", "type" : "bytes"},
60
- { "name" : "validation_class", "type" : "string"},
61
- { "name" : "index_type", "type" : [ "IndexType", "null" ]},
62
- { "name" : "index_name", "type" : [ "string", "null" ] } ]},
63
- { "name" : "CfDef", "type" : "record", "fields" : [
64
- { "name" : "keyspace", "type" : "string"},
65
- { "name" : "name", "type" : "string"},
66
- { "name" : "column_type", "type" : [ "string", "null" ]},
67
- { "name" : "clock_type", "type" : [ "string", "null" ]},
68
- { "name" : "comparator_type", "type" : [ "string", "null" ]},
69
- { "name" : "subcomparator_type", "type" : [ "string", "null" ]},
70
- { "name" : "reconciler", "type" : [ "string", "null" ]},
71
- { "name" : "comment", "type" : [ "string", "null" ]},
72
- { "name" : "row_cache_size", "type" : [ "double", "null" ]},
73
- { "name" : "preload_row_cache", "type" : [ "boolean", "null" ]},
74
- { "name" : "key_cache_size", "type" : [ "double", "null" ]},
75
- { "name" : "read_repair_chance", "type" : [ "double", "null" ]},
76
- { "name" : "gc_grace_seconds", "type" : [ "int", "null" ]},
77
- { "name" : "column_metadata", "type" : [ { "type" : "array", "items" : "ColumnDef"}, "null" ]},
78
- { "name" : "id", "type" : [ "int", "null" ] } ]},
79
- { "name" : "KsDef", "type" : "record", "fields" : [
80
- { "name" : "name", "type" : "string"}, { "name" : "strategy_class", "type" : "string"},
81
- { "name" : "strategy_options", "type" : [ { "type" : "map", "values" : "string"}, "null" ]},
82
- { "name" : "replication_factor", "type" : "int"}, { "name" : "cf_defs", "type" : { "type" : "array", "items" : "CfDef" } } ]},
83
- { "name" : "MutationsMapEntry", "type" : "record", "fields" : [ { "name" : "key", "type" : "bytes"}, { "name" : "mutations", "type" : { "type" : "map", "values" : { "type" : "array", "items" : "Mutation" } } } ]},
84
- { "name" : "CoscsMapEntry", "type" : "record", "fields" : [ { "name" : "key", "type" : "bytes"}, { "name" : "columns", "type" : { "type" : "array", "items" : "ColumnOrSuperColumn" } } ]},
85
- { "name" : "ConsistencyLevel", "type" : "enum", "symbols" : [ "ZERO", "ONE", "QUORUM", "DCQUORUM", "DCQUORUMSYNC", "ALL" ]},
86
- { "name" : "InvalidRequestException", "type" : "error", "fields" : [ { "name" : "why", "type" : [ "string", "null" ] } ]},
87
- { "name" : "NotFoundException", "type" : "error", "fields" : [ { "name" : "why", "type" : [ "string", "null" ] } ]},
88
- { "name" : "UnavailableException", "type" : "error", "fields" : [ { "name" : "why", "type" : [ "string", "null" ] } ]},
89
- { "name" : "TimedOutException", "type" : "error", "fields" : [ { "name" : "why", "type" : [ "string", "null" ] } ] }
90
- ],
91
-
92
-
93
- "messages" : { "get" : {
94
- "request" : [ { "name" : "key", "type" : "bytes"},
95
- { "name" : "column_path", "type" : "ColumnPath"},
96
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
97
- } ],
98
- "response" : "ColumnOrSuperColumn",
99
- "errors" : [ "InvalidRequestException", "NotFoundException", "UnavailableException", "TimedOutException" ]
100
- },
101
- "get_slice" : {
102
- "request" : [ { "name" : "key", "type" : "bytes"},
103
- { "name" : "column_parent", "type" : "ColumnParent"},
104
- { "name" : "predicate", "type" : "SlicePredicate"},
105
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
106
- } ],
107
- "response" : { "type" : "array",
108
- "items" : "ColumnOrSuperColumn"
109
- },
110
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
111
- },
112
- "multiget_slice" : {
113
- "request" : [ { "name" : "keys", "type" : { "type" : "array",
114
- "items" : "bytes"
115
- }},
116
- { "name" : "column_parent", "type" : "ColumnParent"},
117
- { "name" : "predicate", "type" : "SlicePredicate"},
118
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
119
- } ],
120
- "response" : { "type" : "array",
121
- "items" : "CoscsMapEntry"
122
- },
123
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
124
- },
125
- "get_count" : {
126
- "request" : [ { "name" : "key", "type" : "bytes"},
127
- { "name" : "column_parent", "type" : "ColumnParent"},
128
- { "name" : "predicate", "type" : "SlicePredicate"},
129
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
130
- } ],
131
- "response" : "int",
132
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
133
- },
134
- "insert" : {
135
- "request" : [ { "name" : "key", "type" : "bytes"},
136
- { "name" : "column_parent", "type" : "ColumnParent"},
137
- { "name" : "column", "type" : "Column"},
138
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
139
- } ],
140
- "response" : "null",
141
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
142
- },
143
- "remove" : {
144
- "request" : [ { "name" : "key", "type" : "bytes"},
145
- { "name" : "column_path", "type" : "ColumnPath"},
146
- { "name" : "clock", "type" : "Clock"},
147
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
148
- } ],
149
- "response" : "null",
150
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
151
- },
152
- "batch_mutate" : {
153
- "request" : [ { "name" : "mutation_map", "type" : { "type" : "array",
154
- "items" : "MutationsMapEntry"
155
- }},
156
- { "name" : "consistency_level", "type" : "ConsistencyLevel"
157
- } ],
158
- "response" : "null",
159
- "errors" : [ "InvalidRequestException", "UnavailableException", "TimedOutException" ]
160
- },
161
- "system_add_keyspace" : {
162
- "request" : [ { "name" : "ks_def", "type" : "KsDef"
163
- } ],
164
- "response" : "null",
165
- "errors" : [ "InvalidRequestException" ]
166
- },
167
- "set_keyspace" : {
168
- "request" : [ { "name" : "keyspace", "type" : "string"
169
- } ],
170
- "response" : "null",
171
- "errors" : [ "InvalidRequestException" ]
172
- },
173
- "describe_keyspaces" : {
174
- "request" : [ ],
175
- "response" : { "type" : "array",
176
- "items" : "string"
177
- }
178
- },
179
- "describe_cluster_name" : {
180
- "request" : [ ],
181
- "response" : "string"
182
- },
183
- "describe_version" : {
184
- "request" : [ ],
185
- "response" : "string"
186
- }
187
- }
188
- }
@@ -1,51 +0,0 @@
1
- #!/usr/bin/env bash
2
-
3
- input_file="$1" ; shift
4
- output_file="$1" ; shift
5
- map_script=${1-/bin/cat} ; shift
6
- reduce_script=${1-/usr/bin/uniq} ; shift
7
-
8
- dest_keyspace=${dest_keyspace-soc_net_tw}
9
- dest_col_family=${dest_col_family-Wordbag}
10
-
11
- hostname=`hostname`
12
-
13
- # Path to cassandra and hadoop dirs
14
- script_dir=$(readlink -f `dirname $0`)
15
- CASSANDRA_HOME=${CASSANDRA_HOME-/usr/local/share/cassandra}
16
- HADOOP_HOME=${HADOOP_HOME-/usr/lib/hadoop}
17
- avro_file=${avro_file-$CASSANDRA_HOME/interface/avro/cassandra.avpr}
18
-
19
- ARCHIVES=`/bin/ls -1 $CASSANDRA_HOME/build/apache-cassandra*.jar`
20
- for jar in `/bin/ls -1 $CASSANDRA_HOME/build/lib/jars/*.jar $CASSANDRA_HOME/lib/*.jar`; do
21
- ARCHIVES=$ARCHIVES,$jar
22
- done
23
-
24
- ${HADOOP_HOME}/bin/hadoop \
25
- jar ${HADOOP_HOME}/contrib/streaming/hadoop-*streaming*.jar \
26
- -D stream.map.output=cassandra_avro_output \
27
- -D stream.io.identifier.resolver.class=org.apache.cassandra.hadoop.streaming.AvroResolver \
28
- -D cassandra.output.keyspace="$dest_keyspace" \
29
- -D cassandra.output.columnfamily="$dest_col_family" \
30
- -D cassandra.thrift.address=10.204.41.193,10.204.30.11,10.204.58.238,10.204.239.133,10.196.191.31,10.204.103.21,10.202.74.223,10.202.143.95 \
31
- -D cassandra.partitioner.class=org.apache.cassandra.dht.RandomPartitioner \
32
- -D cassandra.thrift.port=9160 \
33
- -D mapreduce.output.columnfamilyoutputformat.batch.threshold=1024 \
34
- -D mapred.reduce.tasks=0 \
35
- -D mapred.map.tasks.speculative.execution=false \
36
- -libjars $ARCHIVES \
37
- -file $avro_file \
38
- -outputformat org.apache.cassandra.hadoop.ColumnFamilyOutputFormat \
39
- -mapper "ruby $script_dir/avromapper.rb --map " \
40
- -input "$input_file" \
41
- -output "$output_file" \
42
- "$@"
43
-
44
- # -D cassandra.thrift.address=10.204.54.190,10.244.42.31,10.244.42.176,10.244.42.112,10.244.42.143,10.244.42.79,10.244.42.4,10.204.53.166 \
45
- # -D cassandra.thrift.address=10.204.221.230,10.243.79.223,10.245.19.159,10.242.154.159,10.242.153.155,10.242.153.203 \
46
-
47
-
48
- # cat /tmp/mj-flip/chimchim-info.log | cut -f5 | ruby -e 'puts $stdin.readlines.map{|l| l.chomp.gsub(/ip-([0-9\-]+)\..*/,"\\1").gsub(/-/,".") }.join(",")'
49
-
50
-
51
-
@@ -1,24 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require 'rubygems'
3
- require 'wukong'
4
- require 'wukong/periodic_monitor'
5
- require 'wukong/store/cassandra'
6
- require 'wukong/script/cassandra_loader_script'
7
-
8
- Settings.use :commandline
9
- Settings.define :log_interval, :default => 1
10
- Settings.cassandra_keyspace = 'soc_net_tw'
11
- Settings.cassandra_col_family = 'TwitterUser'
12
- Settings.cassandra_hosts = "ip-10-204-41-193.ec2.internal:9160,ip-10-204-30-11.ec2.internal:9160,ip-10-204-58-238.ec2.internal:9160,ip-10-204-239-133.ec2.internal:9160,ip-10-196-191-31.ec2.internal:9160,ip-10-204-103-21.ec2.internal:9160,ip-10-202-74-223.ec2.internal:9160,ip-10-202-143-95.ec2.internal:9160"
13
- Settings.resolve!
14
-
15
- require 'cassandra/0.7'
16
- require 'wuclan/twitter' ; include Wuclan::Twitter
17
- require 'wuclan/twitter/cassandra_db'
18
- require 'wukong/store/cassandra/streaming'
19
-
20
- # hdp-catd s3://s3hdfs.infinitemonkeys.info/data/sn/tw/fixd/objects/twitter_user | head
21
-
22
- # CassandraScript.new(Wukong::Store::Cassandra::StructLoader, nil).run
23
- Wukong::CassandraScript.new(Wukong::Store::Cassandra::StructLoader, nil).run
24
-
@@ -1,56 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
3
- require 'wukong'
4
- require 'wukong/streamer/count_keys'
5
- require 'wukong/streamer/count_lines'
6
-
7
- #
8
- #
9
- class CountKeysReducer < Wukong::Streamer::CountLines
10
- #
11
- # Taken from the actionpack Rails component ('action_view/helpers/number_helper')
12
- #
13
- # Formats a +number+ with grouped thousands using +delimiter+. You
14
- # can customize the format using optional <em>delimiter</em> and <em>separator</em> parameters.
15
- # * <tt>delimiter</tt> - Sets the thousands delimiter, defaults to ","
16
- # * <tt>separator</tt> - Sets the separator between the units, defaults to "."
17
- #
18
- # number_with_delimiter(12345678) => 12,345,678
19
- # number_with_delimiter(12345678.05) => 12,345,678.05
20
- # number_with_delimiter(12345678, ".") => 12.345.678
21
- def number_with_delimiter(number, delimiter=",", separator=".")
22
- begin
23
- parts = number.to_s.split('.')
24
- parts[0].gsub!(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1#{delimiter}")
25
- parts.join separator
26
- rescue
27
- number
28
- end
29
- end
30
-
31
- # Override to look nice
32
- def formatted_count item, key_count
33
- key_count_str = number_with_delimiter(key_count.to_i)
34
- "%-25s\t%12s" % [item, key_count_str]
35
- end
36
- end
37
-
38
- #
39
- class CountKeysScript < Wukong::Script
40
- def map_command
41
- # Use `cut` to extract the first field
42
- %Q{ cut -d"\t" -f1 }
43
- end
44
-
45
- #
46
- # There's just the one field
47
- #
48
- def default_options
49
- super.merge :sort_fields => 1
50
- end
51
- end
52
-
53
- # Executes the script when run from command line
54
- if __FILE__ == $0
55
- CountKeysScript.new(nil, CountKeysReducer).run
56
- end
@@ -1,57 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../lib'
3
- require 'wukong'
4
-
5
- #
6
- #
7
- module CountKeys
8
- #
9
- class Mapper < Wukong::Streamer::Base
10
- attr_accessor :keys_count
11
- def initialize *args
12
- self.keys_count = {}
13
- end
14
- def process key, *args
15
- key.gsub!(/-.*/, '') # kill off the slug
16
- self.keys_count[key] ||= 0
17
- self.keys_count[key] += 1
18
- end
19
- def stream *args
20
- super *args
21
- self.keys_count.each do |key, count|
22
- emit [key, count].to_flat
23
- end
24
- end
25
- end
26
- # Identity Mapper
27
- class Reducer < Wukong::Streamer::AccumulatingReducer
28
- attr_accessor :key_count
29
- require 'active_support'
30
- require 'action_view/helpers/number_helper'; include ActionView::Helpers::NumberHelper
31
-
32
- # Override to look nice
33
- def formatted_count item, key_count
34
- key_count_str = number_with_delimiter(key_count.to_i, :delimiter => ',')
35
- "%-25s\t%12s" % [item, key_count_str]
36
- end
37
- def start! *args
38
- self.key_count = 0
39
- end
40
- def accumulate key, count
41
- self.key_count += count.to_i
42
- end
43
- def finalize
44
- yield formatted_count(key, key_count)
45
- end
46
- end
47
-
48
- #
49
- class Script < Wukong::Script
50
- # There's just the one field
51
- def default_options
52
- super.merge :sort_fields => 1, :reduce_tasks => 1
53
- end
54
- end
55
- end
56
-
57
- CountKeys::Script.new(CountKeys::Mapper, CountKeys::Reducer).run
@@ -1,26 +0,0 @@
1
-
2
- # Download the Amazon elastic-mapreduce runner from http://elasticmapreduce.s3.amazonaws.com/elastic-mapreduce-ruby.zip
3
-
4
- # Create a bucket and path to hold your EMR logs, scripts and other ephemera. For instance you might choose 'emr.yourdomain.com' as the bucket and '/wukong' as a scoping path within that bucket. In that case you will refer to it with a path like s3n://emr.yourdomain.com/wukong (see notes below about s3n:// vs. s3:// URLs).
5
-
6
- # Copy the contents of wukong/examples/emr/dot_wukong_dir to ~/.wukong
7
- # Edit emr.yaml -- it has instructions for the
8
-
9
-
10
-
11
-
12
-
13
- h3. s3n:// vs. s3:// URLs
14
-
15
- Many external tools use a URI convention to address files in S3; they typically use the 's3://' scheme, which makes a lot of sense:
16
- s3://emr.yourcompany.com/wukong/happy_job_1/logs/whatever-20100808.log
17
-
18
- Hadoop can maintain an HDFS on the Amazon S3: it uses a block structure and has optimizations for streaming, no file size limitation, and other goodness. However, only hadoop tools can interpret the contents of those blocks -- to everything else it just looks like a soup of blocks labelled block_-8675309 and so forth. Hadoop unfortunately chose the 's3://' scheme for URIs in this filesystem:
19
- s3://s3hdfs.yourcompany.com/path/to/data
20
-
21
- Hadoop is happy to read s3 native files -- 'native' as in, you can look at them with a browser and upload them an download them with any S3 tool out there. There's a 5GB limit on file size, and in some cases a performance hit (but not in our experience enough to worry about). You refer to these files with the 's3n://' scheme ('n' as in 'native'):
22
- s3n://emr.yourcompany.com/wukong/happy_job_1/code/happy_job_1-mapper.rb
23
- s3n://emr.yourcompany.com/wukong/happy_job_1/code/happy_job_1-reducer.rb
24
- s3n://emr.yourcompany.com/wukong/happy_job_1/logs/whatever-20100808.log
25
-
26
- Wukong will coerce things to the right scheme when it knows what that scheme should be (eg. code should be s3n://). It will otherwise leave the path alone. Specifically, if you use a URI scheme for input and output paths you must use 's3n://' for normal s3 files.