monkeyshines 0.0.2
- data/.document +4 -0
- data/.gitignore +43 -0
- data/LICENSE +20 -0
- data/LICENSE.textile +20 -0
- data/README.textile +125 -0
- data/Rakefile +105 -0
- data/VERSION +1 -0
- data/examples/.gitignore +4 -0
- data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
- data/examples/rename_tree/rename_hdp_tree.rb +151 -0
- data/examples/rename_tree/rename_ripd_tree.rb +82 -0
- data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
- data/examples/shorturls/README.textile +111 -0
- data/examples/shorturls/bulkdump_shorturls.rb +46 -0
- data/examples/shorturls/bulkload_shorturls.rb +45 -0
- data/examples/shorturls/extract_urls.rb +12 -0
- data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
- data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
- data/examples/shorturls/old/shorturl_stats.rb +81 -0
- data/examples/shorturls/scrape_shorturls.rb +112 -0
- data/examples/shorturls/shorturl_request.rb +29 -0
- data/examples/shorturls/shorturl_sequence.rb +121 -0
- data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
- data/examples/shorturls/start_shorturl_cache.sh +2 -0
- data/lib/monkeyshines.rb +31 -0
- data/lib/monkeyshines/extensions.rb +16 -0
- data/lib/monkeyshines/fetcher.rb +10 -0
- data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
- data/lib/monkeyshines/fetcher/base.rb +44 -0
- data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
- data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
- data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
- data/lib/monkeyshines/monitor.rb +7 -0
- data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
- data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
- data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
- data/lib/monkeyshines/options.rb +59 -0
- data/lib/monkeyshines/recursive_runner.rb +26 -0
- data/lib/monkeyshines/repository/base.rb +57 -0
- data/lib/monkeyshines/repository/s3.rb +169 -0
- data/lib/monkeyshines/request_stream.rb +11 -0
- data/lib/monkeyshines/request_stream/base.rb +32 -0
- data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
- data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
- data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
- data/lib/monkeyshines/runner.rb +161 -0
- data/lib/monkeyshines/runner_core/options.rb +5 -0
- data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
- data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
- data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
- data/lib/monkeyshines/scrape_request.rb +136 -0
- data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
- data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
- data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
- data/lib/monkeyshines/store.rb +14 -0
- data/lib/monkeyshines/store/base.rb +29 -0
- data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
- data/lib/monkeyshines/store/conditional_store.rb +57 -0
- data/lib/monkeyshines/store/factory.rb +8 -0
- data/lib/monkeyshines/store/flat_file_store.rb +84 -0
- data/lib/monkeyshines/store/key_store.rb +51 -0
- data/lib/monkeyshines/store/null_store.rb +15 -0
- data/lib/monkeyshines/store/read_thru_store.rb +22 -0
- data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
- data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
- data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
- data/lib/monkeyshines/utils/factory_module.rb +106 -0
- data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
- data/lib/monkeyshines/utils/logger.rb +15 -0
- data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
- data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
- data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
- data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
- data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
- data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
- data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
- data/lib/monkeyshines/utils/trollop.rb +744 -0
- data/lib/monkeyshines/utils/union_interval.rb +52 -0
- data/lib/monkeyshines/utils/uri.rb +70 -0
- data/lib/monkeyshines/utils/uuid.rb +32 -0
- data/monkeyshines.gemspec +147 -0
- data/scrape_from_file.rb +44 -0
- data/spec/monkeyshines_spec.rb +7 -0
- data/spec/spec_helper.rb +9 -0
- metadata +183 -0
data/.document
ADDED
data/.gitignore
ADDED
@@ -0,0 +1,43 @@
+\#*
+.\#*
+*~
+.DS_Store
+Icon?
+REVISION
+TAGS*
+nohup.out
+.bzr
+.hg
+.svn
+
+a.out
+*.o
+*.pyc
+*.so
+*.stackdump
+*.sw?
+*.tmproj
+*_flymake.*
+.project
+.pydevproject
+.settings
+.tasks-cache
+.yardoc
+
+/**/*DONTVERSION*
+/**/*private*
+/**/cache/*
+/**/log/*
+/**/tmp/*
+/coverage
+/doc/*
+/pkg/*
+/rdoc/*
+
+/db/*.sqlite3
+/db/sphinx
+/config/*.sphinx.conf
+/config/database.yml
+/config/sphinx.yml
+/public/stylesheets/compiled/*
+/vendor/src/**/*
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
+Copyright (c) 2009 Philip (flip) Kromer
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/LICENSE.textile
ADDED
@@ -0,0 +1,20 @@
+---
+layout: default
+title: MIT License
+---
+
+h1(gemheader). {{ site.gemname }} %(small):: license%
+
+<div class="toggle">
+
+h2. MIT License
+
+__Copyright (c) 2009 Philip (flip) Kromer__
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+</div>
data/README.textile
ADDED
@@ -0,0 +1,125 @@
+Monkeyshines is a tool for doing an algorithmic scrape.
+
+It's designed to handle large-scale scrapes that may exceed the capabilities of single-machine relational databases, so it plays nicely with Hadoop / Wukong, with distributed databases (MongoDB, tokyocabinet, etc.), and with distributed job queues (eg "edamame/beanstalk":http://mrflip.github.com/edamame).
+
+---------------------------------------------------------------------------
+
+h2. Overview
+
+A monkeyshines scraper is simple in principle:
+
+
+
+h2. Dependencies
+
+This is best run standalone -- not as a gem; it's still in heavy development. I recommend cloning
+
+* http://github.com/mrflip/edamame
+* http://github.com/mrflip/wuclan
+* http://github.com/mrflip/wukong
+* http://github.com/mrflip/monkeyshines (this repo)
+
+into a common directory.
+
+Additionally, you'll need some of these gems:
+
+* addressable (2.1.0)
+* extlib (0.9.12)
+* htmlentities (4.2.0)
+
+To build the gem, you'll need
+
+* git (1.2.2)
+* jeweler (1.2.1)
+* rake (0.8.7)
+* rspec (1.2.6)
+* rubyforge (1.0.4)
+* sources (0.0.1)
+
+And if you spell ruby with a 'j', you'll want
+
+* jruby-openssl (0.5.2)
+* json-jruby (1.1.7)
+
+
+
+
+h2. Request Queue
+
+h3. Periodic requests
+
+The request stream can be metered using read-through, scheduled (eg cron), or test-and-sleep strategies:
+
+* Scheduled
+* Test and sleep. A queue of resources is cyclically polled, sleeping whenever bored (see the sketch below).
+
+
+
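A minimal sketch of the test-and-sleep strategy, in plain Ruby. The queue class, `Resource` struct, and timings below are illustrative assumptions, not monkeyshines API:

    # Cyclically poll a queue of resources; sleep whenever nothing is due.
    Resource = Struct.new(:name, :next_poll_at)

    class TestAndSleepQueue
      def initialize(resources, nap_secs = 5)
        @resources, @nap_secs = resources, nap_secs
      end

      def run
        loop do
          due = @resources.select { |r| r.next_poll_at <= Time.now }
          if due.empty?
            sleep @nap_secs               # bored: nothing is due yet
            next
          end
          due.each do |r|
            puts "polling #{r.name}"      # stand-in for the real fetch
            r.next_poll_at = Time.now + 60   # eg re-poll in a minute
          end
        end
      end
    end

    # TestAndSleepQueue.new([Resource.new('feed-1', Time.now)]).run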
+h2. Requests
+
+* Base: simple fetch and store of a URI. (The URI specifies an immutable, unique resource.)
+* : single resource, want to check for updates over time.
+* Timeline:
+** Message stream, eg. twitter search or user timeline. Want to do paginated requests back to the last-seen item.
+** Feed: poll the resource and extract contents, store by GUID. Want to poll frequently enough that a single-page request gives full coverage.
+
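To make the Base flavor concrete, here is a stripped-down request object in plain Ruby; the class name and fields are assumptions, far simpler than the gem's own Monkeyshines::ScrapeRequest:

    require 'net/http'
    require 'uri'

    # Base request: fetch one immutable URI, record when and what came back.
    class SimpleScrapeRequest
      attr_reader :url, :scraped_at, :response_code, :contents

      def initialize(url)
        @url = url
      end

      def fetch!
        resp           = Net::HTTP.get_response(URI.parse(url))
        @scraped_at    = Time.now.utc
        @response_code = resp.code.to_i
        @contents      = resp.body
        self
      end

      # One line per fetched request, ready for a flat-file store.
      def to_tsv
        [url, scraped_at.strftime('%Y%m%d%H%M%S'), response_code].join("\t")
      end
    end

    # req = SimpleScrapeRequest.new('http://example.com/').fetch!
    # puts req.to_tsv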
+---------------------------------------------------------------------------
+
+h2. Scraper
+
+* HttpScraper --
+** JSON
+** HTML
+*** \0 separates records, \t separates initial fields;
+*** map \ to \\, then tab, cr and newline to \t, \r and \n resp.
+*** map tab, cr and newline to &#x9;, &#xa; and &#xd; resp.
+
+
+x9 xa xd x7f
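One reading of that flattening scheme, sketched in plain Ruby (an assumption, not the gem's code): records end in \0 and the leading fields are tab-separated, so payload text must never contain a raw backslash, tab, CR, or newline.

    # Backslash-escape a field (block form of gsub avoids backreference surprises).
    def escape_field(str)
      s = str.gsub('\\') { '\\\\' }   # backslash first, so later escapes stay unambiguous
      s = s.gsub("\t") { '\t' }       # literal tab -> the two characters \ t
      s = s.gsub("\r") { '\r' }
      s.gsub("\n") { '\n' }
    end

    # HTML flavor: numeric entities instead of backslash escapes.
    def escape_html_payload(str)
      str.gsub("\t", '&#x9;').gsub("\r", '&#xd;').gsub("\n", '&#xa;')
    end

    # Join escaped fields with tabs; terminate the record with \0.
    def emit_record(*fields)
      fields.map { |f| escape_field(f.to_s) }.join("\t") << "\0"
    end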
+
+* HeadScraper -- records the HEAD parameters
+
+---------------------------------------------------------------------------
+
+h2. Store
+
+
+* Flat file (chunked)
+* Key store
+* Read-through cache
+
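A read-through cache in miniature -- hash-backed purely for illustration; the real thing would front a tyrant/tokyocabinet store:

    # On a hit, answer from cache; on a miss, run the block, save, return.
    class TinyReadThruStore
      def initialize(cache = {})
        @cache = cache
      end

      def set(key)
        @cache.fetch(key) { @cache[key] = yield }
      end
    end

    store = TinyReadThruStore.new
    store.set('http://example.com/a') { '...fetched page...' }   # miss: block runs
    store.set('http://example.com/a') { raise 'never called' }   # hit: served from cache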
+---------------------------------------------------------------------------
+
+h2. Periodic
+
+* Log only every N requests, or every t minutes, or whatever.
+* Restart the session every hour.
+* Close the file and start a new chunk every 4 hours or so. (Mitigates data loss if a file is corrupted, and makes for easy batch processing.)
+
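The pattern behind all three bullets is "fire every N events or every t seconds, whichever comes first". A sketch in plain Ruby -- the gem ships its own PeriodicMonitor; this just shows the idea:

    # Invoke the block every every_n hits or every_secs seconds.
    class Periodic
      def initialize(every_n = 1000, every_secs = 60)
        @every_n, @every_secs = every_n, every_secs
        @count, @last_fired   = 0, Time.now
      end

      def hit
        @count += 1
        return unless (@count % @every_n).zero? || (Time.now - @last_fired) >= @every_secs
        @last_fired = Time.now
        yield @count
      end
    end

    monitor = Periodic.new(25, 3600)
    100.times { monitor.hit { |n| warn "#{n} requests so far" } }
    # => logs at 25, 50, 75 and 100 requests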
+---------------------------------------------------------------------------
+
+h2. Pagination
+
+h4. Session
+
+* *Twitter Search*: Each request brings in up to 100 results in strict reverse ID (pseudo time) order. If the last item ID in a request is less than the previous scrape session's max_id, or if fewer than 100 results are returned, the scrape session is complete. We maintain two scrape_intervals: one spans from the earliest seen search hit to the highest one from the previous scrape; the other ranges backwards from the highest in _this_ scrape session (the first item in the first successful page request) to the lowest in this scrape session (the last item on the most recent successful page request).
+
+** Set no upper limit on the first request.
+** Request by page, holding the max_id fixed.
+** Use the lowest ID from the previous request as the new max_id.
+** Use the supplied 'next page' parameter.
+
+* *Twitter Followers*: Each request brings in 100 followers in reverse order of when the relationship formed. A separate call to the user can tell you how many _total_ followers there are, and you can record how many there were at the end of the last scrape, but there's some slop (if 100 people in the middle of the list /un/follow and 100 more people at the front /follow/ then the total will be the same). High-degree accounts may have as many as 2M followers (20,000 calls).
+
+* *FriendFeed*: Up to four pages. Expiry given by a result set of <100 results.
+
+
+* Paginated: one resource, but requires one or more requests to retrieve fully.
+** Paginated + limit (max_id/since_date): rather than request by increasing page, request one page with a limit parameter until the last-on-page overlaps the previous scrape. For example, say you are scraping search results, and that when you last made the request the max ID was 120_000; the current max_id is 155_000. Request the first page (no limit), then use the lowest ID on each page as the new limit until that last result is less than 120_000.
+** Paginated + stop_on_duplicate: request pages until the last one on the page matches an already-requested instance.
+** Paginated + velocity_estimate: estimate how many pages to request from the item arrival rate and the time since the last scrape (see the sketch after this list). For example, say a user acquires on average 4.1 followers/day and it has been 80 days since the last scrape. With 100 followers/req you will want to request ceil( 4.1 * 80 / 100 ) = 4 pages.
+
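A sketch of the paginate-with-limit loop and the velocity estimate, in plain Ruby. The block passed to scrape_since stands in for the actual API call; all names here are illustrative:

    PER_PAGE = 100

    # Walk backwards from the newest item until we overlap the previous
    # session (prev_max_id) or get a short page. The block takes a max_id
    # (nil = no limit) and returns a newest-first array of {:id => ...} items.
    def scrape_since(prev_max_id)
      seen   = []
      max_id = nil                     # no upper limit on the first request
      loop do
        items = yield(max_id)          # at most PER_PAGE items
        seen.concat(items)
        break if items.size < PER_PAGE             # short page: caught up
        break if items.last[:id] <= prev_max_id    # overlapped previous scrape
        max_id = items.last[:id] - 1   # lowest ID seen becomes the new limit
      end
      seen
    end

    # Fake pager over ids 155_000 down to 100_000; prev session ended at 120_000.
    ids = (100_000..155_000).to_a.reverse
    new_items = scrape_since(120_000) { |max_id|
      pool = max_id ? ids.select { |i| i <= max_id } : ids
      pool.first(PER_PAGE).map { |i| { :id => i } }
    }
    puts new_items.size   # ~35_000 items fetched before overlapping the old scrape

    # Velocity estimate: pages needed ~= rate * elapsed / page size.
    def pages_to_request(rate_per_day, days_since_last, per_page = PER_PAGE)
      (rate_per_day * days_since_last / per_page.to_f).ceil
    end
    pages_to_request(4.1, 80)   # => ceil(4.1 * 80 / 100) = 4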
+h4. Rescheduling
+
+Want to schedule the next scrape so that it yields a couple of pages, or one mostly-full page. Need to track a rate (num_items / timespan), clamped to min_reschedule / max_reschedule bounds.
+
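That clamped-rate rule in plain Ruby -- the bounds and page size below are assumed values, not the gem's defaults:

    MIN_RESCHEDULE =        60.0   # never revisit sooner than a minute
    MAX_RESCHEDULE = 24 * 3600.0   # never wait longer than a day
    FULL_PAGE      = 100

    # Seconds until the resource should have accumulated about one full page.
    def next_delay(num_items, timespan_secs)
      rate = num_items / timespan_secs.to_f   # observed items per second
      (FULL_PAGE / rate).clamp(MIN_RESCHEDULE, MAX_RESCHEDULE)
    end

    next_delay(50, 3600)   # 50 items/hour => revisit in 7200s (two hours)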
data/Rakefile
ADDED
@@ -0,0 +1,105 @@
+require 'rubygems'
+require 'rake'
+
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gem|
+    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
+    gem.name = "monkeyshines"
+    gem.summary = %Q{A simple scraper for directed scrapes of APIs, feed or structured HTML.}
+    gem.description = %Q{A simple scraper for directed scrapes of APIs, feed or structured HTML. Plays nicely with wuclan and wukong.}
+    gem.email = "flip@infochimps.org"
+    gem.homepage = "http://github.com/mrflip/monkeyshines"
+    gem.authors = ["Philip (flip) Kromer"]
+    gem.add_dependency 'addressable'
+    gem.add_dependency 'uuid'
+    gem.add_dependency 'wukong'
+  end
+  Jeweler::GemcutterTasks.new
+rescue LoadError
+  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+end
+
+require 'spec/rake/spectask'
+Spec::Rake::SpecTask.new(:spec) do |spec|
+  spec.libs << 'lib' << 'spec'
+  spec.spec_files = FileList['spec/**/*_spec.rb']
+end
+
+Spec::Rake::SpecTask.new(:rcov) do |spec|
+  spec.libs << 'lib' << 'spec'
+  spec.pattern = 'spec/**/*_spec.rb'
+  spec.rcov = true
+end
+task :spec => :check_dependencies
+task :default => :spec
+
+begin
+  require 'reek/rake_task'
+  Reek::RakeTask.new do |t|
+    t.fail_on_error = true
+    t.verbose = false
+    t.source_files = ['lib/**/*.rb', 'examples/**/*.rb']
+  end
+rescue LoadError
+  task :reek do
+    abort "Reek is not available. In order to run reek, you must: sudo gem install reek"
+  end
+end
+
+begin
+  require 'roodi'
+  require 'roodi_task'
+  RoodiTask.new do |t|
+    t.verbose = false
+  end
+rescue LoadError
+  task :roodi do
+    abort "Roodi is not available. In order to run roodi, you must: sudo gem install roodi"
+  end
+end
+
+begin
+  require 'yard'
+  YARD::Rake::YardocTask.new do |yard|
+  end
+rescue LoadError
+  task :yardoc do
+    abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
+  end
+end
+
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  require 'rdoc'
+  if File.exist?('VERSION')
+    version = File.read('VERSION')
+  else
+    version = ""
+  end
+
+  rdoc.options += [
+    '-SHN',
+    '-f', 'darkfish', # use darkfish rdoc styler
+  ]
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "edamame #{version}"
+  #
+  File.open(File.dirname(__FILE__)+'/.document').each{|line| rdoc.rdoc_files.include(line.chomp) }
+end
+
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  if File.exist?('VERSION.yml')
+    config = YAML.load(File.read('VERSION.yml'))
+    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+  else
+    version = ""
+  end
+
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "monkeyshines #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end
+
data/VERSION
ADDED
@@ -0,0 +1 @@
+0.0.2
data/examples/.gitignore
ADDED
data/examples/bulk_urls/scrape_bulk_urls.rb
ADDED
@@ -0,0 +1,64 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'monkeyshines'
+require 'monkeyshines/runner'
+require 'pathname'
+
+#
+#
+#
+require 'wuclan/twitter'
+# un-namespace request classes.
+include Wuclan::Twitter::Scrape
+
+Monkeyshines::WORK_DIR = '/tmp'
+WORK_DIR = Pathname.new(Monkeyshines::WORK_DIR).realpath.to_s
+
+# ===========================================================================
+#
+# scrape_shorturls.rb --
+#
+# To scrape from a list of shortened urls:
+#
+#   ./shorturl_random_scrape.rb --from-type=FlatFileStore --from=request_urls.tsv
+#
+# To do a random scrape:
+#
+#   ./shorturl_random_scrape.rb --from-type=RandomUrlStream --base-url=tinyurl.com
+#     --base-url="http://tinyurl.com" --min-limit= --max-limit= --encoding_radix=
+#
+#
+opts = Trollop::options do
+  opt :log,          "Log to file instead of STDERR"
+  # input from file
+  opt :from,         "URI for scrape store to load from", :type => String
+  opt :skip,         "Initial lines to skip",             :type => Integer
+  # output storage
+  opt :cache_uri,    "URI for cache server",              :type => String, :default => ':1978'
+  opt :chunk_time,   "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
+  opt :dest_dir,     "Filename base to store output. default ./work/ripd", :default => WORK_DIR+'/ripd'
+  opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:date/:handle+:timestamp-:pid.tsv"
+  opt :into,         "URI for scrape store into",         :type => String
+end
+opts[:handle] ||= 'com.twitter'
+scrape_config = YAML.load(File.open(ENV['HOME']+'/.monkeyshines'))
+opts.merge! scrape_config
+
+# ******************** Log ********************
+if (opts[:log])
+  opts[:log] = (WORK_DIR+'/log/'+File.basename(opts[:from],'.tsv'))
+  $stdout = $stderr = File.open(opts[:log]+"-console.log", "a")
+end
+
+#
+# Execute the scrape
+#
+scraper = Monkeyshines::Runner.new(
+  :dest_store     => { :type => :conditional_store,
+    :cache => { :type => :tyrant_rdb_key_store, :uri => opts[:cache_uri] },
+    :store => opts.merge({ :type => :chunked_flat_file_store }), },
+    # :store => { :type => :flat_file_store, :filename => opts[:into] }, },
+  :request_stream => { :type => :base, :klass => Monkeyshines::ScrapeRequest,
+    :store => { :type => :flat_file_store, :filemode => 'r', :filename => opts[:from] } }
+)
+scraper.run
data/examples/rename_tree/rename_hdp_tree.rb
ADDED
@@ -0,0 +1,151 @@
+#!/usr/bin/env ruby19
+$: << ENV['HOME']+'/ics/rubygems/trollop-1.14/lib'
+$: << ENV['HOME']+'/ics/rubygems/log4r-1.0.5/src'
+require "monkeyshines/utils/logger"
+require "monkeyshines/utils/filename_pattern.rb"; include Monkeyshines::Utils
+require 'wukong'
+require 'fileutils'
+require 'trollop'
+
+#
+# This script demonstrates the use of FilenamePattern.
+#
+# The details are meaningless (it's a throwaway script I used to move to a more
+# unified naming scheme for scraped files), but I think it nicely demonstrates
+# how useful the FilenamePattern class can be.
+#
+
+opts = Trollop::options do
+  opt :root, "base dir to move (tw0227, etc)", :required => true, :type => String
+  opt :go,   "actually do rename (otherwise do a dry run)"
+end
+
+# The tree to walk
+RIPD_ROOT = '/user/flip/ripd'
+
+#
+# Old files to rename
+#
+old_filename_pats = {
+  # "/user/flip/#{opts[:root]}/bundled/bundled_bundled_*/*.scrape.tsv" =>
+  #   ':any_id/bundled/bundled_bundled_:date/bundle+:timestamp.scrape.:ext',
+  # "/user/flip/#{opts[:root]}/bundled/bundled_fff_*/*fff_*-0*" => {
+  #   :patt => ':any_id/bundled/bundled_fff_:date-:any_id/:{flavor}_:date-:segment',
+  #   :toks => { :ext => 'tsv' } }
+  # "/user/flip/#{opts[:root]}/bundled/bundled_bundled_*/bundled-_20*.tsv" => {
+  #   :patt => ':any_id/bundled/bundled_bundled_:date/bundled-_:date.:ext',
+  #   :toks => { :flavor => 'bundled', :time => '000000' } }
+  # "/user/flip/#{opts[:root]}/bundled/bundled_idok_*/*idok_*-0*" => {
+  #   :patt => ':any_id/bundled/bundled_idok_0126_pt_0215-:any_id/:{flavor}_0126_pt_0215-:segment',
+  #   :toks => { :ext => 'tsv', :date => '20090215', :flavor => 'bundled_idok' } }
+  # "/user/flip/#{opts[:root]}/bundled/bundled_bundled_*/bundled-_*.tsv" =>
+  #   ':any_id/bundled/bundled_bundled_:date/bundled-_:date.:ext',
+  # '/user/flip/ripd/com.twitter.stream/hosebird-*' =>
+  #   '/user/flip/ripd/:handle/hosebird-:date-:time.:ext',
+  # "/user/flip/#{opts[:root]}/bundled/bundled_public_timeline_*/bundled_public_timeline_*.tsv" => {
+  #   :patt => ':any_id/bundled/bundled_public_timeline_:date/bundled_public_timeline_:date.:ext',
+  #   :toks => { :hostname => 'old+timeline' } }
+  # "/user/flip/#{opts[:root]}/bundled/bundled_public_timeline_*/bundled_public_timeline_*[0-9]" => {
+  #   :patt => ':any_id/bundled/bundled_public_timeline_:date/bundled_public_timeline_:date',
+  #   :toks => { :hostname => 'old+timeline', :ext => 'tsv' } }
+  "/user/flip/#{opts[:root]}/bundled/bundled_public_timeline_*/part-*[0-9]" => {
+    :patt => ':any_id/bundled/bundled_public_timeline_:date/part-:segment',
+    :toks => { :flavor => 'timeline', :ext => 'tsv' } }
+}
+
+#
+# How to template new filename
+#
+new_token_defaults = {
+  :dest_dir => RIPD_ROOT,
+  :pid      => '0',
+  :hostname => 'old',
+  :handle   => 'com.twitter'
+}
+new_filename_pat = FilenamePattern.new(
+  ':dest_dir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid-:hostname+:flavor.:ext', new_token_defaults)
+
+MADE_DIR = { }
+#
+# Rename with logging and without overwriting
+#
+def rename_carefully old_file, new_filename, do_it=false
+  Log.info "%s%-87s\t=> %s" % [do_it ? "" : "DRY RUN ", old_file.path, new_filename]
+  return unless do_it
+  dirname = File.dirname(new_filename)
+  if !MADE_DIR[dirname] then Wukong::Dfs::HFile.mkdir_p(dirname) ; MADE_DIR[dirname] = true ; end
+  old_file.mv new_filename
+end
+
+#
+# Do this thing
+#
+old_filename_pats.each do |files_to_rename, old_filename_rule|
+  Log.info "Renaming files matching #{files_to_rename}"
+  if old_filename_rule.is_a? Hash
+    old_filename_pat = FilenamePattern.new(old_filename_rule[:patt])
+    more_toks        = old_filename_rule[:toks] || { }
+  else
+    old_filename_pat = FilenamePattern.new(old_filename_rule)
+    more_toks        = { }
+  end
+  #
+  # List files and rename
+  #
+  Wukong::Dfs.list_files(files_to_rename).each do |hdfs_file|
+    filename_tokens = old_filename_pat.recognize(hdfs_file.path, { :segment => '\d+', :flavor => '\w+'}) or next
+    filename_tokens.merge!(more_toks)
+    if (filename_tokens[:timestamp].blank?) && (!filename_tokens[:date].blank?)
+      timepart = filename_tokens[:time].blank? ? ('0'+filename_tokens[:segment]) : filename_tokens[:time]
+      filename_tokens[:timestamp] = filename_tokens[:date] + (timepart || '000000')
+    end
+    new_filename = new_filename_pat.make(filename_tokens)
+    rename_carefully hdfs_file, new_filename, opts[:go]
+  end
+end
+
+
+# '/user/flip/pkgd/user/flip/tw0227/bundled/bundled_bundled_*.bz2' =>
+#   { :pat_str => ':any_id/bundled/bundled_bundled_:date.:ext',
+#     :toks => { :ext => '.tsv.bz2' } },
+
+
+
+
+# -rw-r--r-- 3 flip supergroup 2055552674 2009-02-18 13:18 /user/flip/tw0218/bundled/bundled_fff_20090126-00000/bundled_fff_20090126-00000
+# -rw-r--r-- 3 flip supergroup 2328853732 2009-02-18 13:08 /user/flip/tw0218/bundled/bundled_fff_20090126-00001/bundled_fff_20090126-00001
+# -rw-r--r-- 3 flip supergroup  630259166 2009-02-18 13:55 /user/flip/tw0218/bundled/bundled_idok_0126_pt_0215-00053/bundled_idok_0126_pt_0215-00053
+# -rw-r--r-- 3 flip supergroup 1714844022 2009-02-17 12:17 /user/flip/tw0218/bundled/bundled_bundled_20090118/bundled-_20090118.tsv
+# -rw-r--r-- 3 flip supergroup 4053904382 2009-02-17 12:18 /user/flip/tw0218/bundled/bundled_bundled_20090119/bundled-_20090119.tsv
+# -rw-r--r-- 3 flip supergroup 3612882035 2009-02-17 12:36 /user/flip/tw0218/bundled/bundled_bundled_20090120/bundled-_20090120.tsv
+# -rw-r--r-- 3 flip supergroup 4309364084 2009-02-17 12:42 /user/flip/tw0218/bundled/bundled_bundled_20090121/bundled-_20090121.tsv
+# -rw-r--r-- 3 flip supergroup 4375598899 2009-02-17 12:49 /user/flip/tw0218/bundled/bundled_bundled_20090122/bundled-_20090122.tsv
+# -rw-r--r-- 3 flip supergroup 2414994564 2009-02-17 12:56 /user/flip/tw0218/bundled/bundled_bundled_20090123/bundled-_20090123.tsv
+# -rw-r--r-- 3 flip supergroup        612 2009-02-17 13:01 /user/flip/tw0218/bundled/bundled_bundled_20090125/bundled-_20090125.tsv
+# -rw-r--r-- 3 flip supergroup 1120007814 2009-02-17 13:03 /user/flip/tw0218/bundled/bundled_bundled_20090204/bundled-_20090204.tsv
+# -rw-r--r-- 3 flip supergroup  534874538 2009-02-17 13:06 /user/flip/tw0218/bundled/bundled_bundled_20090205/bundled-_20090205.tsv
+# -rw-r--r-- 3 flip supergroup  404436617 2009-02-17 13:07 /user/flip/tw0218/bundled/bundled_bundled_20090206/bundled-_20090206.tsv
+# -rw-r--r-- 3 flip supergroup  359037171 2009-02-17 13:08 /user/flip/tw0218/bundled/bundled_bundled_20090207/bundled-_20090207.tsv
+# -rw-r--r-- 3 flip supergroup  332668257 2009-02-17 13:08 /user/flip/tw0218/bundled/bundled_bundled_20090208/bundled-_20090208.tsv
+# -rw-r--r-- 3 flip supergroup  304904205 2009-02-17 13:09 /user/flip/tw0218/bundled/bundled_bundled_20090209/bundled-_20090209.tsv
+# -rw-r--r-- 3 flip supergroup  295217809 2009-02-17 13:09 /user/flip/tw0218/bundled/bundled_bundled_20090210/bundled-_20090210.tsv
+# -rw-r--r-- 3 flip supergroup  257376099 2009-02-17 13:10 /user/flip/tw0218/bundled/bundled_bundled_20090211/bundled-_20090211.tsv
+# -rw-r--r-- 3 flip supergroup  180147925 2009-02-17 13:10 /user/flip/tw0218/bundled/bundled_bundled_20090212/bundled-_20090212.tsv
+# -rw-r--r-- 3 flip supergroup  150611510 2009-02-17 13:11 /user/flip/tw0218/bundled/bundled_bundled_20090214/bundled-_20090214.tsv
+# -rw-r--r-- 3 flip supergroup  154181256 2009-02-17 13:11 /user/flip/tw0218/bundled/bundled_bundled_20090215/bundled-_20090215.tsv
+# -rw-r--r-- 3 flip supergroup   74288574 2009-02-17 13:12 /user/flip/tw0218/bundled/bundled_bundled_20090216/bundled-_20090216.tsv
+# -rw-r--r-- 3 flip supergroup    2006507 2009-02-17 13:12 /user/flip/tw0218/bundled/bundled_bundled_20090217/bundled-_20090217.tsv
+# -rw-r--r-- 3 flip supergroup  232422855 2009-02-17 13:11 /user/flip/tw0219/bundled/bundled_bundled_20090213/bundled-_20090213.tsv
+# -rw-r--r-- 3 flip supergroup  558290288 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00004
+# -rw-r--r-- 3 flip supergroup 1130590440 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00009
+# -rw-r--r-- 3 flip supergroup  523600649 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00025
+# -rw-r--r-- 3 flip supergroup  565480025 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00028
+# -rw-r--r-- 3 flip supergroup  566689087 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00033
+# -rw-r--r-- 3 flip supergroup  545436522 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00036
+# -rw-r--r-- 3 flip supergroup  563565767 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00039
+# -rw-r--r-- 3 flip supergroup  544478849 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00046
+# -rw-r--r-- 3 flip supergroup  566687292 2009-02-27 16:52 /user/flip/tw0227/bundled/bundled_public_timeline_20090227/part-00055
+
+# -rw-r--r-- 3 flip supergroup  561407978 2009-03-03 01:28 /user/flip/tw0227/bundled/bundled_public_timeline_20090227-0301/bundled_public_timeline_20090227-0301.tsv
+# -rw-r--r-- 3 flip supergroup  559109582 2009-03-03 01:22 /user/flip/tw0227/bundled/bundled_public_timeline_20090227-0302/bundled_public_timeline_20090227-0302.tsv
+# -rw-r--r-- 3 flip supergroup 1126272691 2009-03-01 04:53 /user/flip/tw0227/bundled/bundled_public_timeline_20090227-27_28/bundled_public_timeline_20090227-27_28.tsv