monkeyshines 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. data/.document +4 -0
  2. data/.gitignore +43 -0
  3. data/LICENSE +20 -0
  4. data/LICENSE.textile +20 -0
  5. data/README.textile +125 -0
  6. data/Rakefile +105 -0
  7. data/VERSION +1 -0
  8. data/examples/.gitignore +4 -0
  9. data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
  10. data/examples/rename_tree/rename_hdp_tree.rb +151 -0
  11. data/examples/rename_tree/rename_ripd_tree.rb +82 -0
  12. data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
  13. data/examples/shorturls/README.textile +111 -0
  14. data/examples/shorturls/bulkdump_shorturls.rb +46 -0
  15. data/examples/shorturls/bulkload_shorturls.rb +45 -0
  16. data/examples/shorturls/extract_urls.rb +12 -0
  17. data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
  18. data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
  19. data/examples/shorturls/old/shorturl_stats.rb +81 -0
  20. data/examples/shorturls/scrape_shorturls.rb +112 -0
  21. data/examples/shorturls/shorturl_request.rb +29 -0
  22. data/examples/shorturls/shorturl_sequence.rb +121 -0
  23. data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
  24. data/examples/shorturls/start_shorturl_cache.sh +2 -0
  25. data/lib/monkeyshines.rb +31 -0
  26. data/lib/monkeyshines/extensions.rb +16 -0
  27. data/lib/monkeyshines/fetcher.rb +10 -0
  28. data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
  29. data/lib/monkeyshines/fetcher/base.rb +44 -0
  30. data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
  31. data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
  32. data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
  33. data/lib/monkeyshines/monitor.rb +7 -0
  34. data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
  35. data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
  36. data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
  37. data/lib/monkeyshines/options.rb +59 -0
  38. data/lib/monkeyshines/recursive_runner.rb +26 -0
  39. data/lib/monkeyshines/repository/base.rb +57 -0
  40. data/lib/monkeyshines/repository/s3.rb +169 -0
  41. data/lib/monkeyshines/request_stream.rb +11 -0
  42. data/lib/monkeyshines/request_stream/base.rb +32 -0
  43. data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
  44. data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
  45. data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
  46. data/lib/monkeyshines/runner.rb +161 -0
  47. data/lib/monkeyshines/runner_core/options.rb +5 -0
  48. data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
  49. data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
  50. data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
  51. data/lib/monkeyshines/scrape_request.rb +136 -0
  52. data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
  53. data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
  54. data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
  55. data/lib/monkeyshines/store.rb +14 -0
  56. data/lib/monkeyshines/store/base.rb +29 -0
  57. data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
  58. data/lib/monkeyshines/store/conditional_store.rb +57 -0
  59. data/lib/monkeyshines/store/factory.rb +8 -0
  60. data/lib/monkeyshines/store/flat_file_store.rb +84 -0
  61. data/lib/monkeyshines/store/key_store.rb +51 -0
  62. data/lib/monkeyshines/store/null_store.rb +15 -0
  63. data/lib/monkeyshines/store/read_thru_store.rb +22 -0
  64. data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
  65. data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
  66. data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
  67. data/lib/monkeyshines/utils/factory_module.rb +106 -0
  68. data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
  69. data/lib/monkeyshines/utils/logger.rb +15 -0
  70. data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
  71. data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
  72. data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
  73. data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
  74. data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
  75. data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
  76. data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
  77. data/lib/monkeyshines/utils/trollop.rb +744 -0
  78. data/lib/monkeyshines/utils/union_interval.rb +52 -0
  79. data/lib/monkeyshines/utils/uri.rb +70 -0
  80. data/lib/monkeyshines/utils/uuid.rb +32 -0
  81. data/monkeyshines.gemspec +147 -0
  82. data/scrape_from_file.rb +44 -0
  83. data/spec/monkeyshines_spec.rb +7 -0
  84. data/spec/spec_helper.rb +9 -0
  85. metadata +183 -0
@@ -0,0 +1,82 @@
+ #!/usr/bin/env ruby
+ $: << File.dirname(__FILE__)+'/../../lib'
+ $: << ENV['HOME']+'/ics/rubygems/trollop-1.14/lib'
+ $: << ENV['WUKONG_PATH'] if ENV['WUKONG_PATH']
+ require "monkeyshines/utils/logger"
+ require "monkeyshines/utils/filename_pattern.rb"; include Monkeyshines::Utils
+ require 'wukong/extensions/hash'
+ require 'fileutils'
+ require 'trollop'
+
+ #
+ # This script demonstrates the use of FilenamePattern.
+ #
+ # The details are meaningless (it's a throwaway script I used to move to a more
+ # unified naming scheme for scraped files), but I think it nicely demonstrates
+ # how useful the FilenamePattern class can be.
+ #
+
+ opts = Trollop::options do
+   opt :dry_run, "perform a dry run (no actions are taken)"
+ end
+
+ # The tree to walk
+ RIPD_ROOT = '/data/ripd'
+
+ #
+ # Old files to rename
+ #
+ old_filename_pats = {
+   RIPD_ROOT+'/com.tw/com.twitter/bundled/_200*/**/*' =>
+     RIPD_ROOT+'/com.tw/:handle/bundled/_:date/_:hour/bundle+:timestamp.scrape.:ext',
+   # RIPD_ROOT+'/com.tw/com.twitter.stream/hosebird-*' =>
+   #   RIPD_ROOT+'/com.tw/:handle/hosebird-:date-:time.:ext',
+   # RIPD_ROOT+'/com.tw/com.twitter.search/*/com.twitter.search+*[^r].tsv' =>
+   #   RIPD_ROOT+'/com.tw/:handle/:date/:handle+:timestamp-:pid.:ext'
+ }
+
+ #
+ # How to template new filename
+ #
+ new_token_defaults = {
+   :dest_dir => RIPD_ROOT,
+   :pid      => '0',
+   :hostname => 'old',
+ }
+ new_filename_pat = FilenamePattern.new(
+   ':dest_dir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid-:hostname.:ext', new_token_defaults)
+
+ #
+ # Rename with logging and without overwriting
+ #
+ def rename_carefully old_filename, new_filename, dry_run=false
+   if File.exists?(new_filename) then Log.warn "Cowardly refusing to overwrite #{new_filename} from #{old_filename}" ; return ; end
+   Log.info "%s%-60s \t=> %s" % [dry_run ? 'DRY RUN - ' : '', old_filename, new_filename]
+   return if dry_run
+   FileUtils.mkdir_p File.dirname(new_filename)
+   FileUtils.mv old_filename, new_filename
+ end
+
+ def fix_filename_tokens! filename_tokens
+   if (!filename_tokens[:timestamp]) && (filename_tokens[:date] || filename_tokens[:time])
+     filename_tokens[:timestamp] = "%s%s" % [filename_tokens[:date], filename_tokens[:time]]
+   end
+ end
+
+ #
+ # Do this thing
+ #
+ old_filename_pats.each do |files_to_rename, old_filename_pat_str|
+   old_filename_pat = FilenamePattern.new(old_filename_pat_str)
+   Log.info "Renaming files matching #{files_to_rename}"
+   Dir[files_to_rename].sort.each do |old_filename|
+     next unless File.file?(old_filename)
+     filename_tokens = old_filename_pat.recognize(old_filename) or next
+     fix_filename_tokens! filename_tokens
+     new_filename = new_filename_pat.make(filename_tokens)
+     rename_carefully old_filename, new_filename, opts[:dry_run]
+   end
+ end
+
+ # example_str = '/data/ripd/_com/_tw/com.twitter/bundled/_20090224/_18/bundle+20090224180354.scrape.tsv.bz2'
+ # p [old_filename_pat.pattern, old_filename_pat.make_recognizer(old_token_vals), old_filename_pat.recognize(example_str)]
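
The FilenamePattern round trip this script leans on (recognize tokens out of an existing path, then make a new path from them) is easier to see in isolation. A minimal sketch, assuming the token names behave as used above; the pattern strings, example path, and expected results shown in comments are illustrative guesses, not output from the gem:

    require 'monkeyshines/utils/filename_pattern'
    include Monkeyshines::Utils

    # Pull tokens out of a path laid out under one naming scheme ...
    old_pat = FilenamePattern.new('/data/ripd/:handle/:date/:handle+:timestamp.:ext')
    tokens  = old_pat.recognize('/data/ripd/com.twitter/20090224/com.twitter+20090224180354.tsv')
    # tokens should come back roughly as
    #   { :handle => 'com.twitter', :date => '20090224', :timestamp => '20090224180354', :ext => 'tsv' }

    # ... then render them into a different scheme, with defaults filling any missing tokens.
    new_pat = FilenamePattern.new(':dest_dir/:handle/:date/:handle+:timestamp-:pid.:ext',
                                  :dest_dir => '/data/ripd', :pid => '0')
    puts new_pat.make(tokens)
    # /data/ripd/com.twitter/20090224/com.twitter+20090224180354-0.tsv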
@@ -0,0 +1,52 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'monkeyshines'
+ require 'monkeyshines/runner'
+ require 'feedzirra'
+
+
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'monkeyshines'
+ require 'monkeyshines/recursive_runner'
+ WORK_DIR = Subdir[__FILE__,'work'].expand_path
+ puts WORK_DIR
+
+ #
+ # Set up scrape
+ #
+
+ #
+ # * jobs stream from an edamame job queue.
+ # * Many jobs generate paginated requests, stopping when a response overlaps the
+ #   prev_max item.
+ # * Each request is fetched with the standard HTTP fetcher.
+ #
+ # * low-generation jobs are rescheduled based on the observed item rate
+ # * jobs can spawn recursive requests. These have their request_generation
+ #   incremented
+ # * results are sent to a ChunkedFlatFileStore
+ #
+
+ #
+ # Create runner
+ #
+ scraper = Monkeyshines::Runner.new({
+   :log    => { :iters => 100, :dest => Monkeyshines::CONFIG[:handle] },
+   :source => { :type => Monkeyshines::RequestStream::KlassHashRequestStream,
+     :store => { :type => Monkeyshines::RequestStream::EdamameQueue,
+       :queue => { :uris => ['localhost:11210'], :type => 'BeanstalkQueue', },
+       :store => { :uri => ':11211', :type => 'TyrantStore', }, }, },
+   :dest   => { :type => :conditional_store,
+     :cache => { :uri => ':11212', },
+     :store => { :rootdir => WORK_DIR },},
+   # :fetcher => { :type => :fake_fetcher },
+   :force_fetch => false,
+   :sleep_time  => 0.2,
+ })
+
+ # Execute the scrape
+ loop do
+   puts Time.now
+   scraper.run
+ end
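
The comment block's "stop when a response overlaps the prev_max item" rule is the heart of the paginated scrape. A generic sketch of that rule in plain Ruby, independent of the Monkeyshines classes; fetch_page is a hypothetical stand-in for whatever issues the paginated request, and items are assumed to arrive newest-first:

    def new_items_since(prev_max)
      collected = []
      page      = 1
      loop do
        items = fetch_page(page)            # hypothetical: returns e.g. [{ 'id' => 1234, ... }, ...]
        break if items.empty?
        fresh = items.take_while{|item| item['id'] > prev_max }
        collected.concat(fresh)
        break if fresh.size < items.size    # response overlapped prev_max: older pages were already seen
        page += 1
      end
      collected
    end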
@@ -0,0 +1,111 @@
+ This is a demonstration script showing how to inhale translations from URL shorteners such as http://tinyurl.com/ or http://bit.ly/. It tries to do so as efficiently as possible, using persistent HTTP connections for reduced load and a centralized request cache to reduce unnecessary requests.
+
+ You can feed it a sequential list of urls or have it wander within a range of request strings.
+
+ h2. Setup
+
+ You will need:
+
+ * "Wukong":http://mrflip.com, mostly for several utility methods, though by the time you have a few tens of millions of urls to process you may find it handy in its own right.
+
+ * "Tokyo Tyrant":http://tokyocabinet.sourceforge.net/ or another key-value database to track which URLs have been visited. Tokyo Tyrant's speed and network interface let you efficiently run many scrapers off the same central DB. You need to get both the libraries _and_ the ruby interface for each of tokyo tyrant and tokyo cabinet.
+
+ If you're using tokyo tyrant, you should consider optimizing the database:
+
+ tcrmgr optimize -port 10042 localhost 'bnum=20000000#opts=l'
+
+ will pre-allocate 20 million buckets and a 64-bit index. (You want at least twice as many buckets as entries.)
+
+ h2. Running
+
+ *Source of URLs to scrape*:
+
+ _URLs taken from input files_:
+
+ * --from-type=FlatFileStore if you want to load from a flat file stream.
+ * --from should give the path to the input file: one url per line, as many as you care to supply.
+
+ *OR*
+
+ _URLs randomly generated in a range_:
+
+ * --from-type=RandomUrlStream if you want to generate request URLs at random within a numeric range.
+ * --base-url: the domain to scrape (required).
+ * --min-limit and --max-limit give a numeric range (normal base-10 number) to explore.
+ * --encoding-radix: Most shorteners use base-36: the characters 0-9 and a-z are used in ascending order. Some, such as bit.ly, use base-62 (0-9a-zA-Z) by being case-sensitive: http://bit.ly/ANVgN and http://bit.ly/anvgN are different. Specify --encoding-radix=36 if the shortener ignores case, or --encoding-radix=62 if it is case sensitive. If the base-url is either bit.ly or tinyurl.com you can omit this parameter.
+
+ *Output files*:
+
+ * --dumpfile-chunk-time: How often to rotate output files.
+ * --dumpfile-dir: Base part of the output filename.
+ * --dumpfile-pattern: Pattern for dumpfile names. Defaults to
+   @ :dumpfile_dir/:handle_prefix/:handle/:date/:handle+:datetime-:pid.tsv @
+   With --dumpfile-dir=/data/ripd, --handle=bitly and the default dumpfile-pattern, the scraper will store into files named
+   /data/ripd/shortu/shorturl-bitly/20090708/shorturl-bitly+20090708123456-8342.tsv
+   This may seem insane, but when you've had multiple scrapers running for two months you'll thank me.
+ * --cache-loc: hostname:port for the request cache. This should be a tokyo tyrant server, though it should be easy to swap it out for another distributed key-value store.
+
+ *Other*:
+
+ * --log: optional log file; otherwise outputs progress to the console
+
+ h2. Output Files
+
+ As written, the scraper uses the cache database only as a visited-yet? flag (storing the scraped_at timestamp but nothing else). The actual scrape data is stored in flat files. If you want to store everything in the database, swap out the ConditionalStore for a ReadThruStore (and perhaps back the ReadThruStore with a table-type database such as TyrantTdbKeyStore).
+
+ h3. Output file format
+
+ The output is stored in a series of files with tab-separated rows. Each row holds information about one url:
+
+ @
+ class_name (ignore)   url                    date             code#  resp msg   destination url
+ shorturl_request      http://bit.ly/wukong   20090720003304   301    Moved      http://github.com/mrflip/wukong
+ @
+
+ In order:
+ * a dummy field giving the class name.
+ * the requested URL
+ * the date, stored as YYYYmmddHHMMSS
+ * response_code: the "HTTP status code,":http://en.wikipedia.org/wiki/List_of_HTTP_status_codes see below for explanation. (BTW - why has nobody released a parody of "I've got hos in area codes":http://en.wikipedia.org/wiki/Area_Codes_(song) using HTTP status codes? You have disappointed me, internet.)
+ * response_message: the message accompanying that response code.
+ * contents: the redirect URL, or nothing if none was returned.
+
+ h3. File Chunking
+
+ Every four hours (or according to the --chunk-time parameter) the scraper will close the current dump file and open a new, timestamped one following the same pattern. This mitigates the damage from a corrupted file and lets you migrate the output products to S3 or other offline storage. Make sure you include a :datetime somewhere in the filename, and at least one of :hostname or :pid if you have multiple scraper robots at work.
+
+ h2. Scraper
+
+ * Does a HEAD request only -- the scraper doesn't request the contents of the page, only the redirect header.
+ * Persistent connections -- opens one connection and reuses it for subsequent requests.
+ * Backoff -- if it receives server error response codes the scraper will sleep for several seconds before attempting the next request.
+
+ h2. Response codes
+
+ * 301 Moved - the traditional status code for a redirect to the expanded url
+ * 301 Moved Permanently - this is used interchangeably by bit.ly, no idea why
+ * 302 Found - bit.ly uses this for links marked as spam -- they land you on an 'are you sure?' page on bit.ly's servers.
+ * 302 Moved Temporarily - ??? don't know the diff between _302 Moved Temporarily_ and _307 Temporary Redirect_ in theory or practice.
+ * 307 Temporary Redirect - Used by some shorteners, such as budurl.com, that let you change a URL after the fact.
+ Additionally, these non-redirect responses are meaningful:
+ * 200 OK - used by tinyurl.com to indicate a nonexistent tinyurl.
+ * 200 Apple - no, really. Returned by ad.vu, which does an OK and then a meta refresh. (Presumably so they get a pageview on their ad network.)
+ * 404 Not Found - For bit.ly, a removed or non-existent url string. For tinyurl, an ill-formed url string, like 22vcnf?ic or 22lsj4...some (well-formed but missing ones get a 200 OK).
+
+ h2. Seed data
+
+ To prevent unnecessary load on the shorteners' services, you can download several million URL expansions from infochimps.org. Feel free to contribute your efforts there as well.
+
+ You will want to use the @bulkload_shorturls.rb@ script to fill the request sentinel cache.
+
+ h2. See Also
+
+ * *On URL Shorteners*:
+ ** http://joshua.schachter.org/2009/04/on-url-shorteners.html
+ ** http://snook.ca/archives/php/url-shortener/
+ ** http://simonwillison.net/2009/Apr/11/revcanonical/
+ * *Archive Team effort to scrape*:
+ ** http://archiveteam.org/index.php?title=TinyURL
+ * *Base 62 encoding*:
+ ** http://refactormycode.com/codes/125-base-62-encoding
+ ** http://github.com/jtzemp/base62/tree/master
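
To make the --encoding-radix discussion above concrete, here is a small stand-alone sketch (plain Ruby, not part of the gem) of how a shorturl slug maps to and from an integer under base-36 versus base-62, using the 0-9a-zA-Z ordering the README describes; the ordering an actual shortener uses may differ:

    # Base-62 alphabet: digits, then lowercase, then uppercase (0-9a-zA-Z).
    BASE62_CHARS = ('0'..'9').to_a + ('a'..'z').to_a + ('A'..'Z').to_a

    # Case-insensitive shorteners: base-36 (0-9, a-z); Ruby handles this natively.
    'anvgn'.to_i(36)    # => 17910023
    17910023.to_s(36)   # => "anvgn"

    # Case-sensitive shorteners such as bit.ly: base-62, so 'ANVgN' and 'anvgN'
    # decode to different numbers and are therefore different short urls.
    def decode62(slug)
      slug.each_char.inject(0){|num, ch| num * 62 + BASE62_CHARS.index(ch) }
    end

    def encode62(num)
      return '0' if num.zero?
      digits = []
      while num > 0
        digits.unshift BASE62_CHARS[num % 62]
        num /= 62
      end
      digits.join
    end

    decode62('ANVgN') == decode62('anvgN')   # => false
    encode62(decode62('ANVgN'))              # => "ANVgN"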
@@ -0,0 +1,46 @@
+ #!/usr/bin/env ruby
+ $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
+ require 'rubygems'
+ require 'trollop'
+ require 'wukong'
+ require 'monkeyshines'
+ require 'shorturl_request'
+ require 'shorturl_sequence'
+ require 'monkeyshines/utils/uri'
+ require 'time'
+
+ #
+ # Command line options
+ #
+ opts = Trollop::options do
+   # opt :from_type, 'Class name for scrape store to load from', :type => String
+   # opt :from, 'URI for scrape store to load from', :type => String
+   opt :handle, "Handle for scrape", :type => String
+   # opt :into, 'Filename for flat TSV dump', :type => String
+   opt :log, 'File to store log', :type => String
+ end
+
+ # ******************** Log ********************
+
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 20_000, :time => 30)
+
+ # ******************** Read From ********************
+ TYRANT_PORTS = { 'tinyurl' => ":10001", 'bitly' => ":10002", 'other' => ":10003" }
+ src_uri = TYRANT_PORTS[opts[:handle]] or raise "Need a handle (bitly, tinyurl or other). got: #{opts[:handle]}"
+ src_store = Monkeyshines::Store::TyrantTdbKeyStore.new(src_uri)
+ Log.info "Loaded store with #{src_store.size}"
+
+ # ******************** Write into ********************
+ # dest_store = Monkeyshines::Store::FlatFileStore.new(opts[:into], opts.reverse_merge(:filemode => 'w'))
+ RDB_PORTS = { 'tinyurl' => ":10042", 'bitly' => ":10043", 'other' => ":10044" }
+ dest_uri = RDB_PORTS[opts[:handle]] or raise "Need a handle (bitly, tinyurl or other). got: #{opts[:handle]}"
+ dest_store = Monkeyshines::Store::TyrantRdbKeyStore.new(dest_uri)
+ # src_store_klass = Wukong.class_from_resource('Monkeyshines::Store::'+opts[:from_type])
+ # src_store = src_store_klass.new(opts[:from])
+ Log.info "Loading into store with #{dest_store.size}"
+
+ # ******************** Dump ********************
+ src_store.each do |key, hsh|
+   periodic_log.periodically{ [src_store.size, dest_store.size, hsh.values_of('url', 'scraped_at', 'response_code', 'response_message', 'contents')] }
+   dest_store.save hsh['url'], hsh['scraped_at']
+ end
@@ -0,0 +1,45 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'tokyocabinet' ; require 'tokyotyrant'
+ require 'trollop'
+ $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
+ require 'wukong'
+ require 'monkeyshines'
+ require 'shorturl_request'
+ require 'shorturl_sequence'
+ require 'multiplex_shorturl_cache'
+
+ # Command Line options
+ opts = Trollop::options do
+   opt :from_type, 'Class name for scrape store to load from', :type => String
+   opt :from, 'URI for scrape store to load from', :type => String
+   opt :handle, "Handle for scrape", :type => String
+   opt :log, 'File to store log', :type => String
+ end
+ Trollop::die :from_type unless opts[:from_type]
+
+ # ******************** Log ********************
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 20_000, :time => 30)
+
+ # ******************** Load from flat file ********************
+ src_store_klass = Wukong.class_from_resource('Monkeyshines::Store::'+opts[:from_type])
+ src_store = src_store_klass.new(opts[:from], opts.merge(:filemode => 'r'))
+
+ # ******************** Store into read-thru cache ********************
+ RDB_PORTS = { 'tinyurl' => "localhost:10042", 'bitly' => "localhost:10043", 'other' => "localhost:10044" }
+ dest_uri = RDB_PORTS[opts[:handle]] or raise "Need a handle (bitly, tinyurl or other). got: #{opts[:handle]}"
+ dest_store = Monkeyshines::Store::TyrantRdbKeyStore.new(dest_uri)
+ # dest_store = Monkeyshines::Store::MultiplexShorturlCache.new(RDB_PORTS)
+
+ # ******************** Dump ********************
+ src_store.each do |_, url, scat, *args|
+   periodic_log.periodically{ [dest_store.size, url, scat, args] }
+   dest_store.set_nr url, scat
+ end
+
+ #
+ # On a DB with 2M entries, this loads about 700/s
+ # You can optimize with something like
+ #   EXPECTED_MAX_KEYS = 20_000_000
+ #   store.db.optimize("bnum=#{2*EXPECTED_MAX_KEYS}#opts=l") # large (64-bit), 40M buckets
+ #
@@ -0,0 +1,12 @@
+ #!/usr/bin/env ruby
+ $: << '/home/flip/ics/wukong/lib' # 'ENV['WUKONG_DIR'] if ENV['WUKONG_DIR']
+ require 'wukong'
+
+ SHORTURL_RE = %r{\Ahttp://(?:1link.in|4url.cc|6url.com|adjix.com|ad.vu|bellypath.com|bit.ly|bkite.com|budurl.com|canurl.com|chod.sk|cli.gs|decenturl.com|dn.vc|doiop.com|dwarfurl.com|easyuri.com|easyurl.net|ff.im|go2cut.com|gonext.org|hulu.com|hypem.com|ifood.tv|ilix.in|is.gd|ix.it|jdem.cz|jijr.com|kissa.be|kurl.us|litturl.com|lnkurl.com|memurl.com|metamark.net|miklos.dk|minilien.com|minurl.org|muhlink.com|myurl.in|myurl.us|notlong.com|ow.ly|plexp.com|poprl.com|qurlyq.com|redirx.com|s3nt.com|shorterlink.com|shortlinks.co.uk|short.to|shorturl.com|shrinklink.co.uk|shrinkurl.us|shrt.st|shurl.net|simurl.com|shorl.com|smarturl.eu|snipr.com|snipurl.com|snurl.com|sn.vc|starturl.com|surl.co.uk|tighturl.com|timesurl.at|tiny123.com|tiny.cc|tinylink.com|tinyurl.com|tobtr.com|traceurl.com|tr.im|tweetburner.com|twitpwr.com|twitthis.com|twurl.nl|u.mavrev.com|ur1.ca|url9.com|urlborg.com|urlbrief.com|urlcover.com|urlcut.com|urlhawk.com|url-press.com|urlsmash.com|urltea.com|urlvi.be|vimeo.com|wlink.us|xaddr.com|xil.in|xrl.us|x.se|xs.md|yatuc.com|yep.it|yweb.com|zi.ma|w3t.org)/.}i
+ class Mapper < Wukong::Streamer::Base
+   def process rsrc, url, tweet_id, user_id
+     yield url if url =~ SHORTURL_RE
+   end
+ end
+
+ Wukong::Script.new(Mapper, nil, :reduce_command => '/usr/bin/uniq').run
@@ -0,0 +1,32 @@
+ class Monkeyshines::Store::MultiplexShorturlCache < Monkeyshines::Store::ReadThruStore
+   attr_accessor :dests, :store_uris
+
+   # Store into tokyo tyrant
+   # TYRANT_PORTS = { 'tinyurl' => ":10001", 'bitly' => ":10002", 'other' => ":10003" }
+
+   def initialize store_uris, options={}
+     self.dests = { }
+     store_uris.each do |handle, uri|
+       dests[handle] = Monkeyshines::Store::ReadThruStore.new uri
+     end
+   end
+
+   def set key, *args, &block
+     case
+     when (key =~ %r{^http://tinyurl.com/(.*)}) then dests['tinyurl'].set($1, *args, &block)
+     when (key =~ %r{^http://bit.ly/(.*)})      then dests['bitly'  ].set($1, *args, &block)
+     else                                            dests['other'  ].set(key, *args, &block)
+     end
+   end
+
+   def size
+     dests.inject(0){|sum, hand_db| sum + hand_db[1].size }
+   end
+   def close
+     dests.each{|hdl, db| db.close }
+   end
+ end
+
+
+
+
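
Going by the constructor and #set above, wiring the multiplexed cache up might look like the sketch below. The port mapping reuses the commented-out TYRANT_PORTS values, and the block-supplies-the-value convention for #set is an assumption carried over from the &block parameter it forwards to the per-handle ReadThruStore:

    require 'monkeyshines'
    require 'multiplex_shorturl_cache'

    # One backing ReadThruStore per shortener handle.
    store_uris = { 'tinyurl' => ':10001', 'bitly' => ':10002', 'other' => ':10003' }
    cache = Monkeyshines::Store::MultiplexShorturlCache.new(store_uris)

    # #set strips the shortener's hostname and files the slug under the matching
    # store; anything it doesn't recognize lands in 'other'. The block is assumed
    # to supply the value to cache.
    scraped = { 'scraped_at' => '20090720003304' }
    cache.set('http://bit.ly/ANVgN')       { scraped }
    cache.set('http://tinyurl.com/22vcnf') { scraped }
    cache.set('http://is.gd/example')      { scraped }

    puts cache.size   # entries summed across all backing stores
    cache.close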
@@ -0,0 +1,66 @@
+ #!/usr/bin/env ruby
+ $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
+ require 'rubygems'
+ require 'trollop'
+ require 'wukong'
+ require 'monkeyshines'
+ require 'shorturl_request'
+ require 'shorturl_sequence'
+ require 'monkeyshines/utils/uri'
+
+ #
+ # Command line options
+ #
+ opts = Trollop::options do
+   opt :from_type, 'Class name for scrape store to load from', :type => String
+   opt :from, 'URI for scrape store to load from', :type => String
+   opt :into, 'Filename for flat TSV dump', :type => String
+   opt :log, 'File to store log', :type => String
+ end
+ Trollop::die :from_type unless opts[:from_type]
+
+ # ******************** Read From ********************
+ src_store_klass = Wukong.class_from_resource('Monkeyshines::Store::'+opts[:from_type])
+ src_store = src_store_klass.new(opts[:from])
+ Log.info "Loaded store with #{src_store.size}"
+
+ # ******************** Write into ********************
+ DUMPFILE_BASE = opts[:into]
+ def make_store uri
+   Monkeyshines::Store::FlatFileStore.new "#{DUMPFILE_BASE+"-"+uri}.tsv", :filemode => 'w'
+ end
+ dests = { }
+ [ 'tinyurl', 'bitly', 'other'
+ ].each do |handle|
+   dests[handle] = make_store handle
+ end
+
+ # ******************** Log ********************
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 20_000, :time => 30)
+
+ # ******************** Cross Load ********************
+ # Read, process, dump
+ iter = 0
+ src_store.each do |key, hsh|
+   hsh['contents'] ||= hsh.delete 'expanded_url'
+   hsh['response_code'] = nil if hsh['response_code'] == 'nil'
+   hsh['contents'] = nil if hsh['contents'] == 'nil'
+   unless hsh['contents'] || hsh['response_code']
+     # Log.info "removing #{hsh.inspect}"
+     src_store.db.out(key)
+     next
+   end
+   hsh['response_message'] = nil if hsh['response_message'] == 'nil'
+   hsh['url'] ||= hsh.delete 'short_url'
+   req = ShorturlRequest.from_hash hsh
+   periodic_log.periodically{ [src_store.size, req.to_flat] }
+
+   req.contents = Addressable::URI.scrub_url req.contents if req.contents
+
+   case
+   when (key =~ %r{^http://tinyurl.com/(.*)}) then dests['tinyurl'].save req
+   when (key =~ %r{^http://bit.ly/(.*)})      then dests['bitly'  ].save req
+   else                                            dests['other'  ].save req
+   end
+   # src_store.save(key, req.to_hash.compact)
+ end