wukong 1.4.11 → 1.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/hdp-rm CHANGED
@@ -14,9 +14,9 @@ if [ "$1" == "-r" ] ; then
14
14
  shift
15
15
  hadoop fs -test -e "$@"
16
16
  if [ "$?" == "0" ] ; then
17
- # echo "File exists, skipping trash, removing it..."
18
- echo hadoop dfs -rmr "$@"
19
- exec hadoop dfs -rmr "$@"
17
+ # echo "File exists, skipping trash, removing it..."
18
+ echo hadoop dfs -rmr -skipTrash "$@"
19
+ exec hadoop dfs -rmr -skipTrash "$@"
20
20
  fi
21
21
  else
22
22
  hadoop fs -test -e "$@"
@@ -2,10 +2,16 @@ module Enumerable
2
2
  #
3
3
  # Convert an array of values to a string representing it as a pig tuple
4
4
  #
5
+ # def to_pig_tuple
6
+ # map{|*vals| '(' + vals.join(',') + ')' }
7
+ # end
8
+
9
+ #
10
+ # Convert an array to a pig tuple
11
+ #
5
12
  def to_pig_tuple
6
- map{|*vals| '(' + vals.join(',') + ')' }
13
+ '(' + self.join(',') + ')'
7
14
  end
8
-
9
15
  #
10
16
  # Convert an array of values to a string pig format
11
17
  # Delegates to to_pig_tuple -- see also to_pig_bag
@@ -17,7 +23,29 @@ module Enumerable
17
23
  #
18
24
  # Convert an array of values to a string representing it as a pig bag
19
25
  #
26
+ # def to_pig_bag
27
+ # '{' + self.join(',') + '}'
28
+ # end
29
+
30
+ #
31
+ # Convert an array of values to a string representing it as a pig bag
32
+ #
20
33
  def to_pig_bag
21
- '{' + self.join(',') + '}'
34
+ '{' + self.map{|*vals| vals.to_pig_tuple}.join(",") + '}'
35
+ end
36
+
37
+ #
38
+ # Convert a string representing a pig bag into a nested array
39
+ #
40
+ def from_pig_bag
41
+ self.split("),(").map{|t| t.gsub(/[\{\}]/, '').from_pig_tuple} rescue []
42
+ end
43
+
44
+ #
45
+ # Convert a string representing a pig tuple into an array
46
+ #
47
+ def from_pig_tuple
48
+ self.gsub(/[\(\)]/, '').split(',')
22
49
  end
50
+
23
51
  end
@@ -1,21 +1,23 @@
1
1
  require 'time'
2
+ require 'date'
2
3
  DateTime.class_eval do
3
4
  #
4
5
  # Parses the time but never fails.
5
6
  # Return value is always in the UTC time zone.
6
7
  #
7
- # A flattened time -- a 12-digit YYYYmmddHHMMMSS -- is treated as a UTC
8
- # datetime.
8
+ # A flattened datetime -- a 12-digit YYYYmmddHHMMMSS -- is fixed to the UTC
9
+ # time zone by parsing it as YYYYmmddHHMMMSSZ <- 'Z' at end
9
10
  #
10
11
  def self.parse_safely dt
12
+ return nil if dt.blank?
11
13
  begin
12
14
  if dt.to_s =~ /\A\d{12}Z?\z/
13
- parse(dt+'Z', true).utc
15
+ parse(dt+'Z', true)
14
16
  else
15
17
  parse(dt, true).utc
16
18
  end
17
- rescue StandardError
18
- nil
19
+ rescue StandardError => e
20
+ Log.info e
19
21
  end
20
22
  end
21
23
 
@@ -53,7 +53,6 @@ class TokyoTyrant::Balancer::Base
53
53
  def close
54
54
  @servers.all?{ |server| server.close rescue nil}
55
55
  end
56
-
57
56
  end
58
57
 
59
58
  module TokyoDbConnection
@@ -67,11 +66,27 @@ module TokyoDbConnection
67
66
  ].freeze unless defined?(TokyoDbConnection::TyrantDb::DB_SERVERS)
68
67
 
69
68
  DB_PORTS = {
70
- :user_ids => 12001,
71
- :screen_names => 12002,
72
- :search_ids => 12003,
73
- :tweets_parsed => 12004,
74
- :users_parsed => 12005,
69
+ :screen_names => 12002,
70
+ :search_ids => 12003,
71
+ #
72
+ :tw_user_info => 14000,
73
+ :tw_wordbag => 14101,
74
+ :tw_influence => 14102,
75
+ :tw_trstrank => 14103,
76
+ :tw_conversation => 14104,
77
+ #
78
+ :screen_names2 => 12004,
79
+ :search_ids2 => 12005,
80
+ #
81
+ :tw_user_info2 => 14200,
82
+ :tw_wordbag2 => 14201,
83
+ :tw_influence2 => 14202,
84
+ :tw_trstrank2 => 14203,
85
+ :tw_conversation2 => 14204,
86
+ :tw_strong_links2 => 14205,
87
+ :tw_word_stats2 => 14206,
88
+ #
89
+ :ip_geo_census => 14400,
75
90
  } unless defined?(TokyoDbConnection::TyrantDb::DB_PORTS)
76
91
 
77
92
  def initialize dataset
@@ -82,8 +97,6 @@ module TokyoDbConnection
82
97
  return @db if @db
83
98
  port = DB_PORTS[dataset] or raise "Don't know how to reach dataset #{dataset}"
84
99
  @db = TokyoTyrant::Balancer::DB.new(DB_SERVERS.map{|s| s+':'+port.to_s})
85
- # @db = TokyoTyrant::DB.new(DB_SERVERS.first, port.to_i)
86
- @db
87
100
  end
88
101
 
89
102
  def [](*args) ; db[*args] ; end
data/wukong.gemspec CHANGED
@@ -5,11 +5,11 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = %q{wukong}
8
- s.version = "1.4.11"
8
+ s.version = "1.4.12"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["Philip (flip) Kromer"]
12
- s.date = %q{2010-07-30}
12
+ s.date = %q{2010-08-03}
13
13
  s.description = %q{ Treat your dataset like a:
14
14
 
15
15
  * stream of lines when it's efficient to process by lines
@@ -245,31 +245,31 @@ Gem::Specification.new do |s|
245
245
  "spec/spec_helper.rb",
246
246
  "spec/wukong/encoding_spec.rb",
247
247
  "spec/wukong/script_spec.rb",
248
- "examples/binning_percentile_estimator.rb",
249
- "examples/contrib/jeans/normalize.rb",
250
- "examples/contrib/jeans/sizes.rb",
251
- "examples/corpus/words_to_bigrams.rb",
252
- "examples/count_keys.rb",
253
- "examples/count_keys_at_mapper.rb",
254
- "examples/keystore/cassandra_batch_test.rb",
255
- "examples/keystore/conditional_outputter_example.rb",
256
- "examples/network_graph/adjacency_list.rb",
257
- "examples/network_graph/breadth_first_search.rb",
258
- "examples/network_graph/gen_2paths.rb",
259
- "examples/network_graph/gen_multi_edge.rb",
260
- "examples/network_graph/gen_symmetric_links.rb",
261
248
  "examples/pagerank/pagerank.rb",
262
249
  "examples/pagerank/pagerank_initialize.rb",
263
- "examples/rank_and_bin.rb",
264
250
  "examples/sample_records.rb",
265
251
  "examples/server_logs/apache_log_parser.rb",
266
252
  "examples/server_logs/breadcrumbs.rb",
267
253
  "examples/server_logs/user_agent.rb",
254
+ "examples/corpus/words_to_bigrams.rb",
255
+ "examples/count_keys.rb",
256
+ "examples/rank_and_bin.rb",
257
+ "examples/binning_percentile_estimator.rb",
268
258
  "examples/size.rb",
269
- "examples/stats/avg_value_frequency.rb",
270
259
  "examples/store/chunked_store_example.rb",
260
+ "examples/network_graph/breadth_first_search.rb",
261
+ "examples/network_graph/gen_symmetric_links.rb",
262
+ "examples/network_graph/gen_multi_edge.rb",
263
+ "examples/network_graph/adjacency_list.rb",
264
+ "examples/network_graph/gen_2paths.rb",
265
+ "examples/keystore/cassandra_batch_test.rb",
266
+ "examples/keystore/conditional_outputter_example.rb",
267
+ "examples/stats/avg_value_frequency.rb",
268
+ "examples/contrib/jeans/sizes.rb",
269
+ "examples/contrib/jeans/normalize.rb",
270
+ "examples/word_count.rb",
271
271
  "examples/stupidly_simple_filter.rb",
272
- "examples/word_count.rb"
272
+ "examples/count_keys_at_mapper.rb"
273
273
  ]
274
274
 
275
275
  if s.respond_to? :specification_version then
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wukong
3
3
  version: !ruby/object:Gem::Version
4
- hash: 17
4
+ hash: 31
5
5
  prerelease: false
6
6
  segments:
7
7
  - 1
8
8
  - 4
9
- - 11
10
- version: 1.4.11
9
+ - 12
10
+ version: 1.4.12
11
11
  platform: ruby
12
12
  authors:
13
13
  - Philip (flip) Kromer
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-07-30 00:00:00 -05:00
18
+ date: 2010-08-03 00:00:00 +00:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -367,28 +367,28 @@ test_files:
367
367
  - spec/spec_helper.rb
368
368
  - spec/wukong/encoding_spec.rb
369
369
  - spec/wukong/script_spec.rb
370
- - examples/binning_percentile_estimator.rb
371
- - examples/contrib/jeans/normalize.rb
372
- - examples/contrib/jeans/sizes.rb
373
- - examples/corpus/words_to_bigrams.rb
374
- - examples/count_keys.rb
375
- - examples/count_keys_at_mapper.rb
376
- - examples/keystore/cassandra_batch_test.rb
377
- - examples/keystore/conditional_outputter_example.rb
378
- - examples/network_graph/adjacency_list.rb
379
- - examples/network_graph/breadth_first_search.rb
380
- - examples/network_graph/gen_2paths.rb
381
- - examples/network_graph/gen_multi_edge.rb
382
- - examples/network_graph/gen_symmetric_links.rb
383
370
  - examples/pagerank/pagerank.rb
384
371
  - examples/pagerank/pagerank_initialize.rb
385
- - examples/rank_and_bin.rb
386
372
  - examples/sample_records.rb
387
373
  - examples/server_logs/apache_log_parser.rb
388
374
  - examples/server_logs/breadcrumbs.rb
389
375
  - examples/server_logs/user_agent.rb
376
+ - examples/corpus/words_to_bigrams.rb
377
+ - examples/count_keys.rb
378
+ - examples/rank_and_bin.rb
379
+ - examples/binning_percentile_estimator.rb
390
380
  - examples/size.rb
391
- - examples/stats/avg_value_frequency.rb
392
381
  - examples/store/chunked_store_example.rb
393
- - examples/stupidly_simple_filter.rb
382
+ - examples/network_graph/breadth_first_search.rb
383
+ - examples/network_graph/gen_symmetric_links.rb
384
+ - examples/network_graph/gen_multi_edge.rb
385
+ - examples/network_graph/adjacency_list.rb
386
+ - examples/network_graph/gen_2paths.rb
387
+ - examples/keystore/cassandra_batch_test.rb
388
+ - examples/keystore/conditional_outputter_example.rb
389
+ - examples/stats/avg_value_frequency.rb
390
+ - examples/contrib/jeans/sizes.rb
391
+ - examples/contrib/jeans/normalize.rb
394
392
  - examples/word_count.rb
393
+ - examples/stupidly_simple_filter.rb
394
+ - examples/count_keys_at_mapper.rb