monkeyshines 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. data/.document +4 -0
  2. data/.gitignore +43 -0
  3. data/LICENSE +20 -0
  4. data/LICENSE.textile +20 -0
  5. data/README.textile +125 -0
  6. data/Rakefile +105 -0
  7. data/VERSION +1 -0
  8. data/examples/.gitignore +4 -0
  9. data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
  10. data/examples/rename_tree/rename_hdp_tree.rb +151 -0
  11. data/examples/rename_tree/rename_ripd_tree.rb +82 -0
  12. data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
  13. data/examples/shorturls/README.textile +111 -0
  14. data/examples/shorturls/bulkdump_shorturls.rb +46 -0
  15. data/examples/shorturls/bulkload_shorturls.rb +45 -0
  16. data/examples/shorturls/extract_urls.rb +12 -0
  17. data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
  18. data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
  19. data/examples/shorturls/old/shorturl_stats.rb +81 -0
  20. data/examples/shorturls/scrape_shorturls.rb +112 -0
  21. data/examples/shorturls/shorturl_request.rb +29 -0
  22. data/examples/shorturls/shorturl_sequence.rb +121 -0
  23. data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
  24. data/examples/shorturls/start_shorturl_cache.sh +2 -0
  25. data/lib/monkeyshines.rb +31 -0
  26. data/lib/monkeyshines/extensions.rb +16 -0
  27. data/lib/monkeyshines/fetcher.rb +10 -0
  28. data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
  29. data/lib/monkeyshines/fetcher/base.rb +44 -0
  30. data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
  31. data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
  32. data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
  33. data/lib/monkeyshines/monitor.rb +7 -0
  34. data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
  35. data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
  36. data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
  37. data/lib/monkeyshines/options.rb +59 -0
  38. data/lib/monkeyshines/recursive_runner.rb +26 -0
  39. data/lib/monkeyshines/repository/base.rb +57 -0
  40. data/lib/monkeyshines/repository/s3.rb +169 -0
  41. data/lib/monkeyshines/request_stream.rb +11 -0
  42. data/lib/monkeyshines/request_stream/base.rb +32 -0
  43. data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
  44. data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
  45. data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
  46. data/lib/monkeyshines/runner.rb +161 -0
  47. data/lib/monkeyshines/runner_core/options.rb +5 -0
  48. data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
  49. data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
  50. data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
  51. data/lib/monkeyshines/scrape_request.rb +136 -0
  52. data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
  53. data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
  54. data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
  55. data/lib/monkeyshines/store.rb +14 -0
  56. data/lib/monkeyshines/store/base.rb +29 -0
  57. data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
  58. data/lib/monkeyshines/store/conditional_store.rb +57 -0
  59. data/lib/monkeyshines/store/factory.rb +8 -0
  60. data/lib/monkeyshines/store/flat_file_store.rb +84 -0
  61. data/lib/monkeyshines/store/key_store.rb +51 -0
  62. data/lib/monkeyshines/store/null_store.rb +15 -0
  63. data/lib/monkeyshines/store/read_thru_store.rb +22 -0
  64. data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
  65. data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
  66. data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
  67. data/lib/monkeyshines/utils/factory_module.rb +106 -0
  68. data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
  69. data/lib/monkeyshines/utils/logger.rb +15 -0
  70. data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
  71. data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
  72. data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
  73. data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
  74. data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
  75. data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
  76. data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
  77. data/lib/monkeyshines/utils/trollop.rb +744 -0
  78. data/lib/monkeyshines/utils/union_interval.rb +52 -0
  79. data/lib/monkeyshines/utils/uri.rb +70 -0
  80. data/lib/monkeyshines/utils/uuid.rb +32 -0
  81. data/monkeyshines.gemspec +147 -0
  82. data/scrape_from_file.rb +44 -0
  83. data/spec/monkeyshines_spec.rb +7 -0
  84. data/spec/spec_helper.rb +9 -0
  85. metadata +183 -0
@@ -0,0 +1,82 @@
+ #!/usr/bin/env ruby
+ $: << File.dirname(__FILE__)+'/../../lib'
+ $: << ENV['HOME']+'/ics/rubygems/trollop-1.14/lib'
+ $: << ENV['WUKONG_PATH'] if ENV['WUKONG_PATH']
+ require "monkeyshines/utils/logger"
+ require "monkeyshines/utils/filename_pattern.rb"; include Monkeyshines::Utils
+ require 'wukong/extensions/hash'
+ require 'fileutils'
+ require 'trollop'
+
+ #
+ # This script demonstrates the use of FilenamePattern.
+ #
+ # The details are meaningless (it's a throwaway script I used to move to a more
+ # unified naming scheme for scraped files), but I think it nicely demonstrates
+ # how useful the FilenamePattern class can be.
+ #
+
+ opts = Trollop::options do
+   opt :dry_run, "perform a dry run (no actions are taken)"
+ end
+
+ # The tree to walk
+ RIPD_ROOT = '/data/ripd'
+
+ #
+ # Old files to rename
+ #
+ old_filename_pats = {
+   RIPD_ROOT+'/com.tw/com.twitter/bundled/_200*/**/*' =>
+     RIPD_ROOT+'/com.tw/:handle/bundled/_:date/_:hour/bundle+:timestamp.scrape.:ext',
+   # RIPD_ROOT+'/com.tw/com.twitter.stream/hosebird-*' =>
+   #   RIPD_ROOT+'/com.tw/:handle/hosebird-:date-:time.:ext',
+   # RIPD_ROOT+'/com.tw/com.twitter.search/*/com.twitter.search+*[^r].tsv' =>
+   #   RIPD_ROOT+'/com.tw/:handle/:date/:handle+:timestamp-:pid.:ext'
+ }
+
+ #
+ # How to template new filename
+ #
+ new_token_defaults = {
+   :dest_dir => RIPD_ROOT,
+   :pid      => '0',
+   :hostname => 'old',
+ }
+ new_filename_pat = FilenamePattern.new(
+   ':dest_dir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid-:hostname.:ext', new_token_defaults)
+
+ #
+ # Rename with logging and without overwriting
+ #
+ def rename_carefully old_filename, new_filename, dry_run=false
+   if File.exists?(new_filename) then Log.warn "Cowardly refusing to overwrite #{new_filename} from #{old_filename}" ; return ; end
+   Log.info "%s%-60s \t=> %s" % [dry_run ? 'DRY RUN - ' : '', old_filename, new_filename]
+   return if dry_run
+   FileUtils.mkdir_p File.dirname(new_filename)
+   FileUtils.mv old_filename, new_filename
+ end
+
+ def fix_filename_tokens! filename_tokens
+   if (!filename_tokens[:timestamp]) && (filename_tokens[:date] || filename_tokens[:time])
+     filename_tokens[:timestamp] = "%s%s" % [filename_tokens[:date], filename_tokens[:time]]
+   end
+ end
+
+ #
+ # Do this thing
+ #
+ old_filename_pats.each do |files_to_rename, old_filename_pat_str|
+   old_filename_pat = FilenamePattern.new(old_filename_pat_str)
+   Log.info "Renaming files matching #{files_to_rename}"
+   Dir[files_to_rename].sort.each do |old_filename|
+     next unless File.file?(old_filename)
+     filename_tokens = old_filename_pat.recognize(old_filename) or next
+     fix_filename_tokens! filename_tokens
+     new_filename = new_filename_pat.make(filename_tokens)
+     rename_carefully old_filename, new_filename, opts[:dry_run]
+   end
+ end
+
+ # example_str = '/data/ripd/_com/_tw/com.twitter/bundled/_20090224/_18/bundle+20090224180354.scrape.tsv.bz2'
+ # p [old_filename_pat.pattern, old_filename_pat.make_recognizer(old_token_vals), old_filename_pat.recognize(example_str)]
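
The FilenamePattern round trip this script leans on (recognize tokens out of an existing path, then make a new path from them) is easier to see in isolation. A minimal sketch, assuming the token names behave as used above; the pattern strings, example path, and expected results shown in comments are illustrative guesses, not output from the gem:

    require 'monkeyshines/utils/filename_pattern'
    include Monkeyshines::Utils

    # Pull tokens out of a path laid out under one naming scheme ...
    old_pat = FilenamePattern.new('/data/ripd/:handle/:date/:handle+:timestamp.:ext')
    tokens  = old_pat.recognize('/data/ripd/com.twitter/20090224/com.twitter+20090224180354.tsv')
    # tokens should come back roughly as
    #   { :handle => 'com.twitter', :date => '20090224', :timestamp => '20090224180354', :ext => 'tsv' }

    # ... then render them into a different scheme, with defaults filling any missing tokens.
    new_pat = FilenamePattern.new(':dest_dir/:handle/:date/:handle+:timestamp-:pid.:ext',
                                  :dest_dir => '/data/ripd', :pid => '0')
    puts new_pat.make(tokens)
    # /data/ripd/com.twitter/20090224/com.twitter+20090224180354-0.tsv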
@@ -0,0 +1,52 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'monkeyshines'
+ require 'monkeyshines/runner'
+ require 'feedzirra'
+
+
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'monkeyshines'
+ require 'monkeyshines/recursive_runner'
+ WORK_DIR = Subdir[__FILE__,'work'].expand_path
+ puts WORK_DIR
+
+ #
+ # Set up scrape
+ #
+
+ #
+ # * jobs stream from an edamame job queue.
+ # * Many jobs generate paginated requests, stopping when a response overlaps the
+ #   prev_max item.
+ # * Each request is fetched with the standard HTTP fetcher.
+ #
+ # * low-generation jobs are rescheduled based on the observed item rate
+ # * jobs can spawn recursive requests. These have their request_generation
+ #   incremented
+ # * results are sent to a ChunkedFlatFileStore
+ #
+
+ #
+ # Create runner
+ #
+ scraper = Monkeyshines::Runner.new({
+   :log    => { :iters => 100, :dest => Monkeyshines::CONFIG[:handle] },
+   :source => { :type => Monkeyshines::RequestStream::KlassHashRequestStream,
+     :store => { :type => Monkeyshines::RequestStream::EdamameQueue,
+       :queue => { :uris => ['localhost:11210'], :type => 'BeanstalkQueue', },
+       :store => { :uri => ':11211', :type => 'TyrantStore', }, }, },
+   :dest   => { :type => :conditional_store,
+     :cache => { :uri => ':11212', },
+     :store => { :rootdir => WORK_DIR },},
+   # :fetcher => { :type => :fake_fetcher },
+   :force_fetch => false,
+   :sleep_time  => 0.2,
+ })
+
+ # Execute the scrape
+ loop do
+   puts Time.now
+   scraper.run
+ end
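
The comment block's "stop when a response overlaps the prev_max item" rule is the heart of the paginated scrape. A generic sketch of that rule in plain Ruby, independent of the Monkeyshines classes; fetch_page is a hypothetical stand-in for whatever issues the paginated request, and items are assumed to arrive newest-first:

    def new_items_since(prev_max)
      collected = []
      page      = 1
      loop do
        items = fetch_page(page)            # hypothetical: returns e.g. [{ 'id' => 1234, ... }, ...]
        break if items.empty?
        fresh = items.take_while{|item| item['id'] > prev_max }
        collected.concat(fresh)
        break if fresh.size < items.size    # response overlapped prev_max: older pages were already seen
        page += 1
      end
      collected
    end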
@@ -0,0 +1,111 @@
+ This is a demonstration script showing how to inhale translations from URL shorteners such as http://tinyurl.com/ or http://bit.ly/. It tries to do so as efficiently as possible, using persistent HTTP connections for reduced load and a centralized request cache to reduce unnecessary requests.
+
+ You can feed it a sequential list of urls or have it wander within a range of request strings.
+
+ h2. Setup
+
+ You will need:
+
+ * "Wukong":http://mrflip.com, mostly for several utility methods, though by the time you have a few tens of millions of urls to process you may find it handy in its own right.
+
+ * "Tokyo Tyrant":http://tokyocabinet.sourceforge.net/ or another key-value database to track which URLs have been visited. Tokyo Tyrant's speed and network interface let you efficiently run many scrapers off the same central DB. You need to get both the libraries _and_ the ruby interface for each of tokyo tyrant and tokyo cabinet.
+
+ If you're using tokyo tyrant, you should consider optimizing the database:
+
+ tcrmgr optimize -port 10042 localhost 'bnum=20000000#opts=l'
+
+ will pre-allocate 20 million buckets and a 64-bit index. (You want at least twice as many buckets as entries.)
+
+ h2. Running
+
+ *Source of URLs to scrape*:
+
+ _URLs taken from input files_:
+
+ * --from-type=FlatFileStore if you want to load from a flat file stream.
+ * --from should give the path to the input file: one url per line, as many as you care to supply.
+
+ *OR*
+
+ _URLs randomly generated in a range_:
+
+ * --from-type=RandomUrlStream if you want to generate request URLs at random within a numeric range.
+ * --base-url: the domain to scrape (required).
+ * --min-limit and --max-limit give a numeric range (normal base-10 number) to explore.
+ * --encoding-radix: Most shorteners use base-36: the characters 0-9 and a-z are used in ascending order. Some, such as bit.ly, use base-62 (0-9a-zA-Z) by being case-sensitive: http://bit.ly/ANVgN and http://bit.ly/anvgN are different. Specify --encoding-radix=36 if the shortener ignores case, or --encoding-radix=62 if it is case sensitive. If the base-url is either bit.ly or tinyurl.com you can omit this parameter.
+
+ *Output files*:
+
+ * --dumpfile-chunk-time: How often to rotate output files.
+ * --dumpfile-dir: Base part of the output filename.
+ * --dumpfile-pattern: Pattern for dumpfile names. Defaults to
+   @ :dumpfile_dir/:handle_prefix/:handle/:date/:handle+:datetime-:pid.tsv @
+   With --dumpfile-dir=/data/ripd, --handle=bitly and the default dumpfile-pattern, the scraper will store into files named
+   /data/ripd/shortu/shorturl-bitly/20090708/shorturl-bitly+20090708123456-8342.tsv
+   This may seem insane, but when you've had multiple scrapers running for two months you'll thank me.
+ * --cache-loc: hostname:port for the request cache. This should be a tokyo tyrant server, though it should be easy to swap it out for another distributed key-value store.
+
+ *Other*:
+
+ * --log: optional log file; otherwise outputs progress to the console
+
+ h2. Output Files
+
+ As written, the scraper uses the cache database only as a visited-yet? flag (storing the scraped_at timestamp but nothing else). The actual scrape data is stored in flat files. If you want to store everything in the database, swap out the ConditionalStore for a ReadThruStore (and perhaps back the ReadThruStore with a table-type database such as TyrantTdbKeyStore).
+
+ h3. Output file format
+
+ The output is stored in a series of files with tab-separated rows. Each row holds information about one url:
+
+ @
+ class_name (ignore)   url                    date             code#  resp msg   destination url
+ shorturl_request      http://bit.ly/wukong   20090720003304   301    Moved      http://github.com/mrflip/wukong
+ @
+
+ In order:
+ * a dummy field giving the class name.
+ * the requested URL
+ * the date, stored as YYYYmmddHHMMSS
+ * response_code: the "HTTP status code,":http://en.wikipedia.org/wiki/List_of_HTTP_status_codes see below for explanation. (BTW - why has nobody released a parody of "I've got hos in area codes":http://en.wikipedia.org/wiki/Area_Codes_(song) using HTTP status codes? You have disappointed me, internet.)
+ * response_message: the message accompanying that response code.
+ * contents: the redirect URL, or nothing if none was returned.
+
+ h3. File Chunking
+
+ Every four hours (or according to the --chunk-time parameter) the scraper will close the current dump file and open a new, timestamped one following the same pattern. This mitigates the damage from a corrupted file and lets you migrate the output products to S3 or other offline storage. Make sure you include a :datetime somewhere in the filename, and at least one of :hostname or :pid if you have multiple scraper robots at work.
+
+ h2. Scraper
+
+ * Does a HEAD request only -- the scraper doesn't request the contents of the page, only the redirect header.
+ * Persistent connections -- opens one connection and reuses it for subsequent requests.
+ * Backoff -- if it receives server error response codes the scraper will sleep for several seconds before attempting the next request.
+
+ h2. Response codes
+
+ * 301 Moved - the traditional status code for a redirect to the expanded url
+ * 301 Moved Permanently - this is used interchangeably by bit.ly, no idea why
+ * 302 Found - bit.ly uses this for links marked as spam -- they land you on an 'are you sure?' page on bit.ly's servers.
+ * 302 Moved Temporarily - ??? don't know the diff between _302 Moved Temporarily_ and _307 Temporary Redirect_ in theory or practice.
+ * 307 Temporary Redirect - Used by some shorteners, such as budurl.com, that let you change a URL after the fact.
+ Additionally, these non-redirect responses are meaningful:
+ * 200 OK - used by tinyurl.com to indicate a nonexistent tinyurl.
+ * 200 Apple - no, really. Returned by ad.vu, which does an OK and then a meta refresh. (Presumably so they get a pageview on their ad network.)
+ * 404 Not Found - For bit.ly, a removed or non-existent url string. For tinyurl, an ill-formed url string, like 22vcnf?ic or 22lsj4...some (well-formed but missing ones get a 200 OK).
+
+ h2. Seed data
+
+ To prevent unnecessary load on the shorteners' services, you can download several million URL expansions from infochimps.org. Feel free to contribute your efforts there as well.
+
+ You will want to use the @bulkload_shorturls.rb@ script to fill the request sentinel cache.
+
+ h2. See Also
+
+ * *On URL Shorteners*:
+ ** http://joshua.schachter.org/2009/04/on-url-shorteners.html
+ ** http://snook.ca/archives/php/url-shortener/
+ ** http://simonwillison.net/2009/Apr/11/revcanonical/
+ * *Archive Team effort to scrape*:
+ ** http://archiveteam.org/index.php?title=TinyURL
+ * *Base 62 encoding*:
+ ** http://refactormycode.com/codes/125-base-62-encoding
+ ** http://github.com/jtzemp/base62/tree/master
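
To make the --encoding-radix discussion above concrete, here is a small stand-alone sketch (plain Ruby, not part of the gem) of how a shorturl slug maps to and from an integer under base-36 versus base-62, using the 0-9a-zA-Z ordering the README describes; the ordering an actual shortener uses may differ:

    # Base-62 alphabet: digits, then lowercase, then uppercase (0-9a-zA-Z).
    BASE62_CHARS = ('0'..'9').to_a + ('a'..'z').to_a + ('A'..'Z').to_a

    # Case-insensitive shorteners: base-36 (0-9, a-z); Ruby handles this natively.
    'anvgn'.to_i(36)    # => 17910023
    17910023.to_s(36)   # => "anvgn"

    # Case-sensitive shorteners such as bit.ly: base-62, so 'ANVgN' and 'anvgN'
    # decode to different numbers and are therefore different short urls.
    def decode62(slug)
      slug.each_char.inject(0){|num, ch| num * 62 + BASE62_CHARS.index(ch) }
    end

    def encode62(num)
      return '0' if num.zero?
      digits = []
      while num > 0
        digits.unshift BASE62_CHARS[num % 62]
        num /= 62
      end
      digits.join
    end

    decode62('ANVgN') == decode62('anvgN')   # => false
    encode62(decode62('ANVgN'))              # => "ANVgN"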
@@ -0,0 +1,46 @@
+ #!/usr/bin/env ruby
+ $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
+ require 'rubygems'
+ require 'trollop'
+ require 'wukong'
+ require 'monkeyshines'
+ require 'shorturl_request'
+ require 'shorturl_sequence'
+ require 'monkeyshines/utils/uri'
+ require 'time'
+
+ #
+ # Command line options
+ #
+ opts = Trollop::options do
+   # opt :from_type, 'Class name for scrape store to load from', :type => String
+   # opt :from, 'URI for scrape store to load from', :type => String
+   opt :handle, "Handle for scrape", :type => String
+   # opt :into, 'Filename for flat TSV dump', :type => String
+   opt :log, 'File to store log', :type => String
+ end
+
+ # ******************** Log ********************
+
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 20_000, :time => 30)
+
+ # ******************** Read From ********************
+ TYRANT_PORTS = { 'tinyurl' => ":10001", 'bitly' => ":10002", 'other' => ":10003" }
+ src_uri = TYRANT_PORTS[opts[:handle]] or raise "Need a handle (bitly, tinyurl or other). got: #{opts[:handle]}"
+ src_store = Monkeyshines::Store::TyrantTdbKeyStore.new(src_uri)
+ Log.info "Loaded store with #{src_store.size}"
+
+ # ******************** Write into ********************
+ # dest_store = Monkeyshines::Store::FlatFileStore.new(opts[:into], opts.reverse_merge(:filemode => 'w'))
+ RDB_PORTS = { 'tinyurl' => ":10042", 'bitly' => ":10043", 'other' => ":10044" }
+ dest_uri = RDB_PORTS[opts[:handle]] or raise "Need a handle (bitly, tinyurl or other). got: #{opts[:handle]}"
+ dest_store = Monkeyshines::Store::TyrantRdbKeyStore.new(dest_uri)
+ # src_store_klass = Wukong.class_from_resource('Monkeyshines::Store::'+opts[:from_type])
+ # src_store = src_store_klass.new(opts[:from])
+ Log.info "Loading into store with #{dest_store.size}"
+
+ # ******************** Dump ********************
+ src_store.each do |key, hsh|
+   periodic_log.periodically{ [src_store.size, dest_store.size, hsh.values_of('url', 'scraped_at', 'response_code', 'response_message', 'contents')] }
+   dest_store.save hsh['url'], hsh['scraped_at']
+ end
@@ -0,0 +1,45 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'tokyocabinet' ; require 'tokyotyrant'
+ require 'trollop'
+ $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
+ require 'wukong'
+ require 'monkeyshines'
+ require 'shorturl_request'
+ require 'shorturl_sequence'
+ require 'multiplex_shorturl_cache'
+
+ # Command Line options
+ opts = Trollop::options do
+   opt :from_type, 'Class name for scrape store to load from', :type => String
+   opt :from, 'URI for scrape store to load from', :type => String
+   opt :handle, "Handle for scrape", :type => String
+   opt :log, 'File to store log', :type => String
+ end
+ Trollop::die :from_type unless opts[:from_type]
+
+ # ******************** Log ********************
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 20_000, :time => 30)
+
+ # ******************** Load from flat file ********************
+ src_store_klass = Wukong.class_from_resource('Monkeyshines::Store::'+opts[:from_type])
+ src_store = src_store_klass.new(opts[:from], opts.merge(:filemode => 'r'))
+
+ # ******************** Store into read-thru cache ********************
+ RDB_PORTS = { 'tinyurl' => "localhost:10042", 'bitly' => "localhost:10043", 'other' => "localhost:10044" }
+ dest_uri = RDB_PORTS[opts[:handle]] or raise "Need a handle (bitly, tinyurl or other). got: #{opts[:handle]}"
+ dest_store = Monkeyshines::Store::TyrantRdbKeyStore.new(dest_uri)
+ # dest_store = Monkeyshines::Store::MultiplexShorturlCache.new(RDB_PORTS)
+
+ # ******************** Dump ********************
+ src_store.each do |_, url, scat, *args|
+   periodic_log.periodically{ [dest_store.size, url, scat, args] }
+   dest_store.set_nr url, scat
+ end
+
+ #
+ # On a DB with 2M entries, this loads about 700/s
+ # You can optimize with something like
+ #   EXPECTED_MAX_KEYS = 20_000_000
+ #   store.db.optimize("bnum=#{2*EXPECTED_MAX_KEYS}#opts=l") # large (64-bit), 40M buckets
+ #
@@ -0,0 +1,12 @@
+ #!/usr/bin/env ruby
+ $: << '/home/flip/ics/wukong/lib' # 'ENV['WUKONG_DIR'] if ENV['WUKONG_DIR']
+ require 'wukong'
+
+ SHORTURL_RE = %r{\Ahttp://(?:1link.in|4url.cc|6url.com|adjix.com|ad.vu|bellypath.com|bit.ly|bkite.com|budurl.com|canurl.com|chod.sk|cli.gs|decenturl.com|dn.vc|doiop.com|dwarfurl.com|easyuri.com|easyurl.net|ff.im|go2cut.com|gonext.org|hulu.com|hypem.com|ifood.tv|ilix.in|is.gd|ix.it|jdem.cz|jijr.com|kissa.be|kurl.us|litturl.com|lnkurl.com|memurl.com|metamark.net|miklos.dk|minilien.com|minurl.org|muhlink.com|myurl.in|myurl.us|notlong.com|ow.ly|plexp.com|poprl.com|qurlyq.com|redirx.com|s3nt.com|shorterlink.com|shortlinks.co.uk|short.to|shorturl.com|shrinklink.co.uk|shrinkurl.us|shrt.st|shurl.net|simurl.com|shorl.com|smarturl.eu|snipr.com|snipurl.com|snurl.com|sn.vc|starturl.com|surl.co.uk|tighturl.com|timesurl.at|tiny123.com|tiny.cc|tinylink.com|tinyurl.com|tobtr.com|traceurl.com|tr.im|tweetburner.com|twitpwr.com|twitthis.com|twurl.nl|u.mavrev.com|ur1.ca|url9.com|urlborg.com|urlbrief.com|urlcover.com|urlcut.com|urlhawk.com|url-press.com|urlsmash.com|urltea.com|urlvi.be|vimeo.com|wlink.us|xaddr.com|xil.in|xrl.us|x.se|xs.md|yatuc.com|yep.it|yweb.com|zi.ma|w3t.org)/.}i
+ class Mapper < Wukong::Streamer::Base
+   def process rsrc, url, tweet_id, user_id
+     yield url if url =~ SHORTURL_RE
+   end
+ end
+
+ Wukong::Script.new(Mapper, nil, :reduce_command => '/usr/bin/uniq').run
@@ -0,0 +1,32 @@
+ class Monkeyshines::Store::MultiplexShorturlCache < Monkeyshines::Store::ReadThruStore
+   attr_accessor :dests, :store_uris
+
+   # Store into tokyo tyrant
+   # TYRANT_PORTS = { 'tinyurl' => ":10001", 'bitly' => ":10002", 'other' => ":10003" }
+
+   def initialize store_uris, options={}
+     self.dests = { }
+     store_uris.each do |handle, uri|
+       dests[handle] = Monkeyshines::Store::ReadThruStore.new uri
+     end
+   end
+
+   def set key, *args, &block
+     case
+     when (key =~ %r{^http://tinyurl.com/(.*)}) then dests['tinyurl'].set($1, *args, &block)
+     when (key =~ %r{^http://bit.ly/(.*)})      then dests['bitly'  ].set($1, *args, &block)
+     else                                            dests['other'  ].set(key, *args, &block)
+     end
+   end
+
+   def size
+     dests.inject(0){|sum, hand_db| sum + hand_db[1].size }
+   end
+   def close
+     dests.each{|hdl, db| db.close }
+   end
+ end
+
+
+
+
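
Going by the constructor and #set above, wiring the multiplexed cache up might look like the sketch below. The port mapping reuses the commented-out TYRANT_PORTS values, and the block-supplies-the-value convention for #set is an assumption carried over from the &block parameter it forwards to the per-handle ReadThruStore:

    require 'monkeyshines'
    require 'multiplex_shorturl_cache'

    # One backing ReadThruStore per shortener handle.
    store_uris = { 'tinyurl' => ':10001', 'bitly' => ':10002', 'other' => ':10003' }
    cache = Monkeyshines::Store::MultiplexShorturlCache.new(store_uris)

    # #set strips the shortener's hostname and files the slug under the matching
    # store; anything it doesn't recognize lands in 'other'. The block is assumed
    # to supply the value to cache.
    scraped = { 'scraped_at' => '20090720003304' }
    cache.set('http://bit.ly/ANVgN')       { scraped }
    cache.set('http://tinyurl.com/22vcnf') { scraped }
    cache.set('http://is.gd/example')      { scraped }

    puts cache.size   # entries summed across all backing stores
    cache.close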
@@ -0,0 +1,66 @@
+ #!/usr/bin/env ruby
+ $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
+ require 'rubygems'
+ require 'trollop'
+ require 'wukong'
+ require 'monkeyshines'
+ require 'shorturl_request'
+ require 'shorturl_sequence'
+ require 'monkeyshines/utils/uri'
+
+ #
+ # Command line options
+ #
+ opts = Trollop::options do
+   opt :from_type, 'Class name for scrape store to load from', :type => String
+   opt :from, 'URI for scrape store to load from', :type => String
+   opt :into, 'Filename for flat TSV dump', :type => String
+   opt :log, 'File to store log', :type => String
+ end
+ Trollop::die :from_type unless opts[:from_type]
+
+ # ******************** Read From ********************
+ src_store_klass = Wukong.class_from_resource('Monkeyshines::Store::'+opts[:from_type])
+ src_store = src_store_klass.new(opts[:from])
+ Log.info "Loaded store with #{src_store.size}"
+
+ # ******************** Write into ********************
+ DUMPFILE_BASE = opts[:into]
+ def make_store uri
+   Monkeyshines::Store::FlatFileStore.new "#{DUMPFILE_BASE+"-"+uri}.tsv", :filemode => 'w'
+ end
+ dests = { }
+ [ 'tinyurl', 'bitly', 'other'
+ ].each do |handle|
+   dests[handle] = make_store handle
+ end
+
+ # ******************** Log ********************
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 20_000, :time => 30)
+
+ # ******************** Cross Load ********************
+ # Read, process, dump
+ iter = 0
+ src_store.each do |key, hsh|
+   hsh['contents'] ||= hsh.delete 'expanded_url'
+   hsh['response_code'] = nil if hsh['response_code'] == 'nil'
+   hsh['contents'] = nil if hsh['contents'] == 'nil'
+   unless hsh['contents'] || hsh['response_code']
+     # Log.info "removing #{hsh.inspect}"
+     src_store.db.out(key)
+     next
+   end
+   hsh['response_message'] = nil if hsh['response_message'] == 'nil'
+   hsh['url'] ||= hsh.delete 'short_url'
+   req = ShorturlRequest.from_hash hsh
+   periodic_log.periodically{ [src_store.size, req.to_flat] }
+
+   req.contents = Addressable::URI.scrub_url req.contents if req.contents
+
+   case
+   when (key =~ %r{^http://tinyurl.com/(.*)}) then dests['tinyurl'].save req
+   when (key =~ %r{^http://bit.ly/(.*)})      then dests['bitly'  ].save req
+   else                                            dests['other'  ].save req
+   end
+   # src_store.save(key, req.to_hash.compact)
+ end