monkeyshines 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. data/.document +4 -0
  2. data/.gitignore +43 -0
  3. data/LICENSE +20 -0
  4. data/LICENSE.textile +20 -0
  5. data/README.textile +125 -0
  6. data/Rakefile +105 -0
  7. data/VERSION +1 -0
  8. data/examples/.gitignore +4 -0
  9. data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
  10. data/examples/rename_tree/rename_hdp_tree.rb +151 -0
  11. data/examples/rename_tree/rename_ripd_tree.rb +82 -0
  12. data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
  13. data/examples/shorturls/README.textile +111 -0
  14. data/examples/shorturls/bulkdump_shorturls.rb +46 -0
  15. data/examples/shorturls/bulkload_shorturls.rb +45 -0
  16. data/examples/shorturls/extract_urls.rb +12 -0
  17. data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
  18. data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
  19. data/examples/shorturls/old/shorturl_stats.rb +81 -0
  20. data/examples/shorturls/scrape_shorturls.rb +112 -0
  21. data/examples/shorturls/shorturl_request.rb +29 -0
  22. data/examples/shorturls/shorturl_sequence.rb +121 -0
  23. data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
  24. data/examples/shorturls/start_shorturl_cache.sh +2 -0
  25. data/lib/monkeyshines.rb +31 -0
  26. data/lib/monkeyshines/extensions.rb +16 -0
  27. data/lib/monkeyshines/fetcher.rb +10 -0
  28. data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
  29. data/lib/monkeyshines/fetcher/base.rb +44 -0
  30. data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
  31. data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
  32. data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
  33. data/lib/monkeyshines/monitor.rb +7 -0
  34. data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
  35. data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
  36. data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
  37. data/lib/monkeyshines/options.rb +59 -0
  38. data/lib/monkeyshines/recursive_runner.rb +26 -0
  39. data/lib/monkeyshines/repository/base.rb +57 -0
  40. data/lib/monkeyshines/repository/s3.rb +169 -0
  41. data/lib/monkeyshines/request_stream.rb +11 -0
  42. data/lib/monkeyshines/request_stream/base.rb +32 -0
  43. data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
  44. data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
  45. data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
  46. data/lib/monkeyshines/runner.rb +161 -0
  47. data/lib/monkeyshines/runner_core/options.rb +5 -0
  48. data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
  49. data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
  50. data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
  51. data/lib/monkeyshines/scrape_request.rb +136 -0
  52. data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
  53. data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
  54. data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
  55. data/lib/monkeyshines/store.rb +14 -0
  56. data/lib/monkeyshines/store/base.rb +29 -0
  57. data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
  58. data/lib/monkeyshines/store/conditional_store.rb +57 -0
  59. data/lib/monkeyshines/store/factory.rb +8 -0
  60. data/lib/monkeyshines/store/flat_file_store.rb +84 -0
  61. data/lib/monkeyshines/store/key_store.rb +51 -0
  62. data/lib/monkeyshines/store/null_store.rb +15 -0
  63. data/lib/monkeyshines/store/read_thru_store.rb +22 -0
  64. data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
  65. data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
  66. data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
  67. data/lib/monkeyshines/utils/factory_module.rb +106 -0
  68. data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
  69. data/lib/monkeyshines/utils/logger.rb +15 -0
  70. data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
  71. data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
  72. data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
  73. data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
  74. data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
  75. data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
  76. data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
  77. data/lib/monkeyshines/utils/trollop.rb +744 -0
  78. data/lib/monkeyshines/utils/union_interval.rb +52 -0
  79. data/lib/monkeyshines/utils/uri.rb +70 -0
  80. data/lib/monkeyshines/utils/uuid.rb +32 -0
  81. data/monkeyshines.gemspec +147 -0
  82. data/scrape_from_file.rb +44 -0
  83. data/spec/monkeyshines_spec.rb +7 -0
  84. data/spec/spec_helper.rb +9 -0
  85. metadata +183 -0
data/.document ADDED
@@ -0,0 +1,4 @@
1
+ README.textile
2
+ lib/**/*.rb
3
+ examples/**/*.rb
4
+ LICENSE.textile
data/.gitignore ADDED
@@ -0,0 +1,43 @@
1
+ \#*
2
+ .\#*
3
+ *~
4
+ .DS_Store
5
+ Icon?
6
+ REVISION
7
+ TAGS*
8
+ nohup.out
9
+ .bzr
10
+ .hg
11
+ .svn
12
+
13
+ a.out
14
+ *.o
15
+ *.pyc
16
+ *.so
17
+ *.stackdump
18
+ *.sw?
19
+ *.tmproj
20
+ *_flymake.*
21
+ .project
22
+ .pydevproject
23
+ .settings
24
+ .tasks-cache
25
+ .yardoc
26
+
27
+ /**/*DONTVERSION*
28
+ /**/*private*
29
+ /**/cache/*
30
+ /**/log/*
31
+ /**/tmp/*
32
+ /coverage
33
+ /doc/*
34
+ /pkg/*
35
+ /rdoc/*
36
+
37
+ /db/*.sqlite3
38
+ /db/sphinx
39
+ /config/*.sphinx.conf
40
+ /config/database.yml
41
+ /config/sphinx.yml
42
+ /public/stylesheets/compiled/*
43
+ /vendor/src/**/*
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Philip (flip) Kromer
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/LICENSE.textile ADDED
@@ -0,0 +1,20 @@
1
+ ---
2
+ layout: default
3
+ title: MIT License
4
+ ---
5
+
6
+ h1(gemheader). {{ site.gemname }} %(small):: license%
7
+
8
+ <div class="toggle">
9
+
10
+ h2. MIT License
11
+
12
+ __Copyright (c) 2009 Philip (flip) Kromer__
13
+
14
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19
+
20
+ </div>
data/README.textile ADDED
@@ -0,0 +1,125 @@
1
+ Monkeyshines is a tool for doing an algorithmic scrape.
2
+
3
+ It's designed to handle large-scale scrapes that may exceed the capabilities of single-machine relational databases, so it plays nicely with Hadoop / Wukong, with distributed databases (MongoDB, tokyocabinet, etc.), and with distributed job queues (e.g. "edamame/beanstalk":http://mrflip.github.com/edamame).
4
+
5
+ ---------------------------------------------------------------------------
6
+
7
+ h2. Overview
8
+
9
+ A monkeyshines scraper is simple in principle:
10
+
11
+
12
+
13
+ h2. Dependencies
14
+
15
+ This is best run standalone -- not as a gem; it's still in heavy development. I recommend cloning
16
+
17
+ * http://github.com/mrflip/edamame
18
+ * http://github.com/mrflip/wuclan
19
+ * http://github.com/mrflip/wukong
20
+ * http://github.com/mrflip/monkeyshines (this repo)
21
+
22
+ into a common directory.
23
+
24
+ Additionally, you'll need some of these gems:
25
+
26
+ * addressable (2.1.0)
27
+ * extlib (0.9.12)
28
+ * htmlentities (4.2.0)
29
+
30
+ To build the gem, you'll need
31
+
32
+ * git (1.2.2)
33
+ * jeweler (1.2.1)
34
+ * rake (0.8.7)
35
+ * rspec (1.2.6)
36
+ * rubyforge (1.0.4)
37
+ * sources (0.0.1)
38
+
39
+ And if you spell ruby with a 'j', you'll want
40
+
41
+ * jruby-openssl (0.5.2)
42
+ * json-jruby (1.1.7)
43
+
44
+
45
+
46
+
47
+ h2. Request Queue
48
+
49
+ h3. Periodic requests
50
+
51
+ Request stream can be metered using read-through, scheduled (eg cron), or test-and-sleep.
52
+
53
+ * Scheduled
54
+ * Test and sleep. A queue of resources is cyclically polled, sleeping whenever bored.
55
+
56
+
57
+
58
+ h2. Requests
59
+
60
+ * Base: simple fetch and store of URI. (URI specifies immutable unique resource)
61
+ * : single resource, want to check for updates over time.
62
+ * Timeline:
63
+ ** Message stream, eg. twitter search or user timeline. Want to do paginated requests back to last-seen
64
+ ** Feed: Poll the resource and extract contents, store by GUID. Want to poll frequently enough that single-page request gives full coverage.
65
+
66
+ ---------------------------------------------------------------------------
67
+
68
+ h2. Scraper
69
+
70
+ * HttpScraper --
71
+ ** JSON
72
+ ** HTML
73
+ *** \0 separates records, \t separates initial fields;
74
+ *** map \ to \\, then tab, cr and newline to \t, \r and \n resp.
75
+ *** map tab, cr and newline to &#x9; &#xD; and &#xA; resp.
76
+
77
+
78
+ x9 xa xd x7f
79
+
80
+ * HeadScraper -- records the HEAD parameters
81
+
82
+ ---------------------------------------------------------------------------
83
+
84
+ h2. Store
85
+
86
+
87
+ * Flat file (chunked)
88
+ * Key store
89
+ * Read-through cache
90
+
91
+ ---------------------------------------------------------------------------
92
+
93
+ h2. Periodic
94
+
95
+ * Log only every N requests, or t minutes, or whatever.
96
+ * Restart session every hour
97
+ * Close file and start new chunk every 4 hours or so. (Mitigates data loss if a file is corrupted, makes for easy batch processing).
98
+
99
+ ---------------------------------------------------------------------------
100
+
101
+ h2. Pagination
102
+
103
+ h4. Session
104
+
105
+ * *Twitter Search*: Each request brings in up to 100 results in strict reverse ID (pseudo time) order. If the last item ID in a request is less than the previous scrape session's max_id, or if fewer than 100 results are returned, the scrape session is complete. We maintain two scrape_intervals: one spans from the earliest seen search hit to the highest one from the previous scrape; the other ranges backwards from the highest in _this_ scrape session (the first item in the first successful page request) to the lowest in this scrape session (the last item on the most recent successful page request).
106
+
107
+ ** Set no upper limit on the first request.
108
+ ** Request by page, holding the max_id fixed
109
+ ** Use the lowest ID from the previous request as the new max_id
110
+ ** Use the supplied 'next page' parameter
111
+
112
+ * *Twitter Followers*: Each request brings in 100 followers in reverse order of when the relationship formed. A separate call to the user can tell you how many _total_ followers there are, and you can record how many there were at end of last scrape, but there's some slop (if 100 people in the middle of the list /un/follow and 100 more people at the front /follow/ then the total will be the same). High-degree accounts may have as many as 2M followers (20,000 calls).
113
+
114
+ * *FriendFeed*: Up to four pages. Expiry given by result set of <100 results.
115
+
116
+
117
+ * Paginated: one resource, but requires one or more requests to retrieve in full.
118
+ ** Paginated + limit (max_id/since_date): rather than request by increasing page, request one page with a limit parameter until the last-on-page overlaps the previous scrape. For example, say you are scraping search results, and that when you last made the request the max ID was 120_000; the current max_id is 155_000. Request the first page (no limit). Then use the last result on each page as the new limit_id, repeating until that last result is less than 120_000.
119
+ ** Paginated + stop_on_duplicate: request pages until the last one on the page matches an already-requested instance.
120
+ ** Paginated + velocity_estimate: estimate how many pages to request from the rate at which new items accrue. For example, say a user acquires on average 4.1 followers/day and it has been 80 days since last scrape. With 100 followers/req you will want to request ceil( 4.1 * 80 / 100 ) = 4 pages.
121
+
122
+ h4. Rescheduling
123
+
124
+ We want to schedule the next scrape so that it yields a couple of pages or a mostly-full page. This requires tracking a rate (num_items / timespan), clamped to min_reschedule / max_reschedule bounds.
125
+
data/Rakefile ADDED
@@ -0,0 +1,105 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
8
+ gem.name = "monkeyshines"
9
+ gem.summary = %Q{A simple scraper for directed scrapes of APIs, feed or structured HTML.}
10
+ gem.description = %Q{A simple scraper for directed scrapes of APIs, feed or structured HTML. Plays nicely with wuclan and wukong.}
11
+ gem.email = "flip@infochimps.org"
12
+ gem.homepage = "http://github.com/mrflip/monkeyshines"
13
+ gem.authors = ["Philip (flip) Kromer"]
14
+ gem.add_dependency 'addressable'
15
+ gem.add_dependency 'uuid'
16
+ gem.add_dependency 'wukong'
17
+ end
18
+ Jeweler::GemcutterTasks.new
19
+ rescue LoadError
20
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
21
+ end
22
+
23
+ require 'spec/rake/spectask'
24
+ Spec::Rake::SpecTask.new(:spec) do |spec|
25
+ spec.libs << 'lib' << 'spec'
26
+ spec.spec_files = FileList['spec/**/*_spec.rb']
27
+ end
28
+
29
+ Spec::Rake::SpecTask.new(:rcov) do |spec|
30
+ spec.libs << 'lib' << 'spec'
31
+ spec.pattern = 'spec/**/*_spec.rb'
32
+ spec.rcov = true
33
+ end
34
+ task :spec => :check_dependencies
35
+ task :default => :spec
36
+
37
+ begin
38
+ require 'reek/rake_task'
39
+ Reek::RakeTask.new do |t|
40
+ t.fail_on_error = true
41
+ t.verbose = false
42
+ t.source_files = ['lib/**/*.rb', 'examples/**/*.rb']
43
+ end
44
+ rescue LoadError
45
+ task :reek do
46
+ abort "Reek is not available. In order to run reek, you must: sudo gem install reek"
47
+ end
48
+ end
49
+
50
+ begin
51
+ require 'roodi'
52
+ require 'roodi_task'
53
+ RoodiTask.new do |t|
54
+ t.verbose = false
55
+ end
56
+ rescue LoadError
57
+ task :roodi do
58
+ abort "Roodi is not available. In order to run roodi, you must: sudo gem install roodi"
59
+ end
60
+ end
61
+
62
+ begin
63
+ require 'yard'
64
+ YARD::Rake::YardocTask.new do |yard|
65
+ end
66
+ rescue LoadError
67
+ task :yardoc do
68
+ abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
69
+ end
70
+ end
71
+
72
+ require 'rake/rdoctask'
73
+ Rake::RDocTask.new do |rdoc|
74
+ require 'rdoc'
75
+ if File.exist?('VERSION')
76
+ version = File.read('VERSION')
77
+ else
78
+ version = ""
79
+ end
80
+
81
+ rdoc.options += [
82
+ '-SHN',
83
+ '-f', 'darkfish', # use darkfish rdoc styler
84
+ ]
85
+ rdoc.rdoc_dir = 'rdoc'
86
+ rdoc.title = "edamame #{version}"
87
+ #
88
+ File.open(File.dirname(__FILE__)+'/.document').each{|line| rdoc.rdoc_files.include(line.chomp) }
89
+ end
90
+
91
+ require 'rake/rdoctask'
92
+ Rake::RDocTask.new do |rdoc|
93
+ if File.exist?('VERSION.yml')
94
+ config = YAML.load(File.read('VERSION.yml'))
95
+ version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
96
+ else
97
+ version = ""
98
+ end
99
+
100
+ rdoc.rdoc_dir = 'rdoc'
101
+ rdoc.title = "monkeyshines #{version}"
102
+ rdoc.rdoc_files.include('README*')
103
+ rdoc.rdoc_files.include('lib/**/*.rb')
104
+ end
105
+
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.0.2
@@ -0,0 +1,4 @@
1
+ facebook_stream
2
+ friendster_html
3
+ friendster_api
4
+ myspace_api
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'monkeyshines'
4
+ require 'monkeyshines/runner'
5
+ require 'pathname'
6
+
7
+ #
8
+ #
9
+ #
10
+ require 'wuclan/twitter'
11
+ # un-namespace request classes.
12
+ include Wuclan::Twitter::Scrape
13
+
14
+ Monkeyshines::WORK_DIR = '/tmp'
15
+ WORK_DIR = Pathname.new(Monkeyshines::WORK_DIR).realpath.to_s
16
+
17
+ # ===========================================================================
18
+ #
19
+ # scrape_shorturls.rb --
20
+ #
21
+ # To scrape from a list of shortened urls:
22
+ #
23
+ # ./shorturl_random_scrape.rb --from-type=FlatFileStore --from=request_urls.tsv
24
+ #
25
+ # To do a random scrape:
26
+ #
27
+ # ./shorturl_random_scrape.rb --from-type=RandomUrlStream --base-url=tinyurl.com
28
+ # --base-url="http://tinyurl.com" --min-limit= --max-limit= --encoding_radix=
29
+ #
30
+ #
31
+ opts = Trollop::options do
32
+ opt :log, "Log to file instead of STDERR"
33
+ # input from file
34
+ opt :from, "URI for scrape store to load from", :type => String
35
+ opt :skip, "Initial lines to skip", :type => Integer
36
+ # output storage
37
+ opt :cache_uri, "URI for cache server", :type => String, :default => ':1978'
38
+ opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
39
+ opt :dest_dir, "Filename base to store output. default ./work/ripd", :default => WORK_DIR+'/ripd'
40
+ opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:date/:handle+:timestamp-:pid.tsv"
41
+ opt :into, "URI for scrape store into", :type => String
42
+ end
43
+ opts[:handle] ||= 'com.twitter'
44
+ scrape_config = YAML.load(File.open(ENV['HOME']+'/.monkeyshines'))
45
+ opts.merge! scrape_config
46
+
47
+ # ******************** Log ********************
48
+ if (opts[:log])
49
+ opts[:log] = (WORK_DIR+'/log/'+File.basename(opts[:from],'.tsv'))
50
+ $stdout = $stderr = File.open(opts[:log]+"-console.log", "a")
51
+ end
52
+
53
+ #
54
+ # Execute the scrape
55
+ #
56
+ scraper = Monkeyshines::Runner.new(
57
+ :dest_store => { :type => :conditional_store,
58
+ :cache => { :type => :tyrant_rdb_key_store, :uri => opts[:cache_uri] },
59
+ :store => opts.merge({ :type => :chunked_flat_file_store }), },
60
+ # :store => { :type => :flat_file_store, :filename => opts[:into] }, },
61
+ :request_stream => { :type => :base, :klass => Monkeyshines::ScrapeRequest,
62
+ :store => { :type => :flat_file_store, :filemode => 'r', :filename => opts[:from] } }
63
+ )
64
+ scraper.run
@@ -0,0 +1,151 @@
1
+ #!/usr/bin/env ruby19
2
+ $: << ENV['HOME']+'/ics/rubygems/trollop-1.14/lib'
3
+ $: << ENV['HOME']+'/ics/rubygems/log4r-1.0.5/src'
4
+ require "monkeyshines/utils/logger"
5
+ require "monkeyshines/utils/filename_pattern.rb"; include Monkeyshines::Utils
6
+ require 'wukong'
7
+ require 'fileutils'
8
+ require 'trollop'
9
+
10
+ #
11
+ # This script demonstrates the use of FilenamePattern.
12
+ #
13
+ # The details are meaningless (it's a throwaway script I used to move to a more
14
+ # unified naming scheme for scraped files), but I think it nicely demonstrates
15
+ # how useful the FilenamePattern class can be.
16
+ #
17
+
18
+ opts = Trollop::options do
19
+ opt :root, "base dir to move (tw0227, etc)", :required => true, :type => String
20
+ opt :go, "actually do rename (otherwise do a dry run)"
21
+ end
22
+
23
+ # The tree to walk
24
+ RIPD_ROOT = '/user/flip/ripd'
25
+
26
+ #
27
+ # Old files to rename
28
+ #
29
+ old_filename_pats = {
30
+ # "/user/flip/#{opts[:root]}/bundled/bundled_bundled_*/*.scrape.tsv" =>
31
+ # ':any_id/bundled/bundled_bundled_:date/bundle+:timestamp.scrape.:ext',
32
+ # "/user/flip/#{opts[:root]}/bundled/bundled_fff_*/*fff_*-0*" => {
33
+ # :patt => ':any_id/bundled/bundled_fff_:date-:any_id/:{flavor}_:date-:segment',
34
+ # :toks => { :ext => 'tsv' } }
35
+ # "/user/flip/#{opts[:root]}/bundled/bundled_bundled_*/bundled-_20*.tsv" => {
36
+ # :patt => ':any_id/bundled/bundled_bundled_:date/bundled-_:date.:ext',
37
+ # :toks => { :flavor => 'bundled', :time => '000000' } }
38
+ # "/user/flip/#{opts[:root]}/bundled/bundled_idok_*/*idok_*-0*" => {
39
+ # :patt => ':any_id/bundled/bundled_idok_0126_pt_0215-:any_id/:{flavor}_0126_pt_0215-:segment',
40
+ # :toks => { :ext => 'tsv', :date => '20090215', :flavor => 'bundled_idok' } }
41
+ # "/user/flip/#{opts[:root]}/bundled/bundled_bundled_*/bundled-_*.tsv" =>
42
+ # ':any_id/bundled/bundled_bundled_:date/bundled-_:date.:ext',
43
+ # '/user/flip/ripd/com.twitter.stream/hosebird-*' =>
44
+ # '/user/flip/ripd/:handle/hosebird-:date-:time.:ext',
45
+ # "/user/flip/#{opts[:root]}/bundled/bundled_public_timeline_*/bundled_public_timeline_*.tsv" => {
46
+ # :patt => ':any_id/bundled/bundled_public_timeline_:date/bundled_public_timeline_:date.:ext',
47
+ # :toks => { :hostname => 'old+timeline' } }
48
+ # "/user/flip/#{opts[:root]}/bundled/bundled_public_timeline_*/bundled_public_timeline_*[0-9]" => {
49
+ # :patt => ':any_id/bundled/bundled_public_timeline_:date/bundled_public_timeline_:date',
50
+ # :toks => { :hostname => 'old+timeline', :ext => 'tsv' } }
51
+ "/user/flip/#{opts[:root]}/bundled/bundled_public_timeline_*/part-*[0-9]" => {
52
+ :patt => ':any_id/bundled/bundled_public_timeline_:date/part-:segment',
53
+ :toks => { :flavor => 'timeline', :ext => 'tsv' } }
54
+ }
55
+
56
+ #
57
+ # How to template new filename
58
+ #
59
+ new_token_defaults = {
60
+ :dest_dir => RIPD_ROOT,
61
+ :pid => '0',
62
+ :hostname => 'old',
63
+ :handle => 'com.twitter'
64
+ }
65
+ new_filename_pat = FilenamePattern.new(
66
+ ':dest_dir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid-:hostname+:flavor.:ext', new_token_defaults)
67
+
68
+ MADE_DIR = { }
69
+ #
70
+ # Rename with logging and without overwriting
71
+ #
72
+ def rename_carefully old_file, new_filename, do_it=false
73
+ Log.info "%s%-87s\t=> %s" % [do_it ? "" : "DRY RUN ", old_file.path, new_filename]
74
+ return unless do_it
75
+ dirname = File.dirname(new_filename)
76
+ if !MADE_DIR[dirname] then Wukong::Dfs::HFile.mkdir_p(dirname) ; MADE_DIR[dirname] = true ; end
77
+ old_file.mv new_filename
78
+ end
79
+
80
+ #
81
+ # Do this thing
82
+ #
83
+ old_filename_pats.each do |files_to_rename, old_filename_rule|
84
+ Log.info "Renaming files matching #{files_to_rename}"
85
+ if old_filename_rule.is_a? Hash
86
+ old_filename_pat = FilenamePattern.new(old_filename_rule[:patt])
87
+ more_toks = old_filename_rule[:toks] || { }
88
+ else
89
+ old_filename_pat = FilenamePattern.new(old_filename_rule)
90
+ more_toks = { }
91
+ end
92
+ #
93
+ # List files and rename
94
+ #
95
+ Wukong::Dfs.list_files(files_to_rename).each do |hdfs_file|
96
+ filename_tokens = old_filename_pat.recognize(hdfs_file.path, { :segment => '\d+', :flavor => '\w+'}) or next
97
+ filename_tokens.merge!(more_toks)
98
+ if (filename_tokens[:timestamp].blank?) && (!filename_tokens[:date].blank?)
99
+ timepart = filename_tokens[:time].blank? ? ('0'+filename_tokens[:segment]) : filename_tokens[:time]
100
+ filename_tokens[:timestamp] = filename_tokens[:date] + (timepart || '000000')
101
+ end
102
+ new_filename = new_filename_pat.make(filename_tokens)
103
+ rename_carefully hdfs_file, new_filename, opts[:go]
104
+ end
105
+ end
106
+
107
+
108
+ # '/user/flip/pkgd/user/flip/tw0227/bundled/bundled_bundled_*.bz2' =>
109
+ # { :pat_str => ':any_id/bundled/bundled_bundled_:date.:ext',
110
+ # :toks => { :ext => '.tsv.bz2' } },
111
+
112
+
113
+
114
+
115
+ # -rw-r--r-- 3 flip supergroup 2055552674 2009-02-18 13:18 /user/flip/tw0218/bundled/bundled_fff_20090126-00000/bundled_fff_20090126-00000
116
+ # -rw-r--r-- 3 flip supergroup 2328853732 2009-02-18 13:08 /user/flip/tw0218/bundled/bundled_fff_20090126-00001/bundled_fff_20090126-00001
117
+ # -rw-r--r-- 3 flip supergroup 630259166 2009-02-18 13:55 /user/flip/tw0218/bundled/bundled_idok_0126_pt_0215-00053/bundled_idok_0126_pt_0215-00053
118
+ # -rw-r--r-- 3 flip supergroup 1714844022 2009-02-17 12:17 /user/flip/tw0218/bundled/bundled_bundled_20090118/bundled-_20090118.tsv
119
+ # -rw-r--r-- 3 flip supergroup 4053904382 2009-02-17 12:18 /user/flip/tw0218/bundled/bundled_bundled_20090119/bundled-_20090119.tsv
120
+ # -rw-r--r-- 3 flip supergroup 3612882035 2009-02-17 12:36 /user/flip/tw0218/bundled/bundled_bundled_20090120/bundled-_20090120.tsv
121
+ # -rw-r--r-- 3 flip supergroup 4309364084 2009-02-17 12:42 /user/flip/tw0218/bundled/bundled_bundled_20090121/bundled-_20090121.tsv
122
+ # -rw-r--r-- 3 flip supergroup 4375598899 2009-02-17 12:49 /user/flip/tw0218/bundled/bundled_bundled_20090122/bundled-_20090122.tsv
123
+ # -rw-r--r-- 3 flip supergroup 2414994564 2009-02-17 12:56 /user/flip/tw0218/bundled/bundled_bundled_20090123/bundled-_20090123.tsv
124
+ # -rw-r--r-- 3 flip supergroup 612 2009-02-17 13:01 /user/flip/tw0218/bundled/bundled_bundled_20090125/bundled-_20090125.tsv
125
+ # -rw-r--r-- 3 flip supergroup 1120007814 2009-02-17 13:03 /user/flip/tw0218/bundled/bundled_bundled_20090204/bundled-_20090204.tsv
126
+ # -rw-r--r-- 3 flip supergroup 534874538 2009-02-17 13:06 /user/flip/tw0218/bundled/bundled_bundled_20090205/bundled-_20090205.tsv
127
+ # -rw-r--r-- 3 flip supergroup 404436617 2009-02-17 13:07 /user/flip/tw0218/bundled/bundled_bundled_20090206/bundled-_20090206.tsv
128
+ # -rw-r--r-- 3 flip supergroup 359037171 2009-02-17 13:08 /user/flip/tw0218/bundled/bundled_bundled_20090207/bundled-_20090207.tsv
129
+ # -rw-r--r-- 3 flip supergroup 332668257 2009-02-17 13:08 /user/flip/tw0218/bundled/bundled_bundled_20090208/bundled-_20090208.tsv
130
+ # -rw-r--r-- 3 flip supergroup 304904205 2009-02-17 13:09 /user/flip/tw0218/bundled/bundled_bundled_20090209/bundled-_20090209.tsv
131
+ # -rw-r--r-- 3 flip supergroup 295217809 2009-02-17 13:09 /user/flip/tw0218/bundled/bundled_bundled_20090210/bundled-_20090210.tsv
132
+ # -rw-r--r-- 3 flip supergroup 257376099 2009-02-17 13:10 /user/flip/tw0218/bundled/bundled_bundled_20090211/bundled-_20090211.tsv
133
+ # -rw-r--r-- 3 flip supergroup 180147925 2009-02-17 13:10 /user/flip/tw0218/bundled/bundled_bundled_20090212/bundled-_20090212.tsv
134
+ # -rw-r--r-- 3 flip supergroup 150611510 2009-02-17 13:11 /user/flip/tw0218/bundled/bundled_bundled_20090214/bundled-_20090214.tsv
135
+ # -rw-r--r-- 3 flip supergroup 154181256 2009-02-17 13:11 /user/flip/tw0218/bundled/bundled_bundled_20090215/bundled-_20090215.tsv
136
+ # -rw-r--r-- 3 flip supergroup 74288574 2009-02-17 13:12 /user/flip/tw0218/bundled/bundled_bundled_20090216/bundled-_20090216.tsv
137
+ # -rw-r--r-- 3 flip supergroup 2006507 2009-02-17 13:12 /user/flip/tw0218/bundled/bundled_bundled_20090217/bundled-_20090217.tsv
138
+ # -rw-r--r-- 3 flip supergroup 232422855 2009-02-17 13:11 /user/flip/tw0219/bundled/bundled_bundled_20090213/bundled-_20090213.tsv
139
+ # -rw-r--r-- 3 flip supergroup 558290288 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00004
140
+ # -rw-r--r-- 3 flip supergroup 1130590440 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00009
141
+ # -rw-r--r-- 3 flip supergroup 523600649 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00025
142
+ # -rw-r--r-- 3 flip supergroup 565480025 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00028
143
+ # -rw-r--r-- 3 flip supergroup 566689087 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00033
144
+ # -rw-r--r-- 3 flip supergroup 545436522 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00036
145
+ # -rw-r--r-- 3 flip supergroup 563565767 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00039
146
+ # -rw-r--r-- 3 flip supergroup 544478849 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00046
147
+ # -rw-r--r-- 3 flip supergroup 566687292 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00055
148
+
149
+ # -rw-r--r-- 3 flip supergroup 561407978 2009-03-03 01:28 /user/flip/tw0227/bundled/bundled_public_timeline_20090227-0301/bundled_public_timeline_20090227-0301.tsv
150
+ # -rw-r--r-- 3 flip supergroup 559109582 2009-03-03 01:22 /user/flip/tw0227/bundled/bundled_public_timeline_20090227-0302/bundled_public_timeline_20090227-0302.tsv
151
+ # -rw-r--r-- 3 flip supergroup 1126272691 2009-03-01 04:53 /user/flip/tw0227/bundled/bundled_public_timeline_20090227-27_28/bundled_public_timeline_20090227-27_28.tsv