monkeyshines 0.0.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.textile +2 -2
- data/README.textile +6 -20
- data/examples/README.textile +6 -0
- metadata +2 -10
- data/.document +0 -4
- data/.gitignore +0 -43
- data/LICENSE +0 -20
- data/Rakefile +0 -105
- data/VERSION +0 -1
- data/examples/.gitignore +0 -4
- data/monkeyshines.gemspec +0 -147
- data/scrape_from_file.rb +0 -44
data/LICENSE.textile
CHANGED
@@ -5,7 +5,7 @@ title: MIT License
|
|
5
5
|
|
6
6
|
h1(gemheader). {{ site.gemname }} %(small):: license%
|
7
7
|
|
8
|
-
<div class="toggle">
|
8
|
+
<notextile><div class="toggle"></notextile>
|
9
9
|
|
10
10
|
h2. MIT License
|
11
11
|
|
@@ -17,4 +17,4 @@ The above copyright notice and this permission notice shall be included in all c
|
|
17
17
|
|
18
18
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
19
19
|
|
20
|
-
|
20
|
+
<notextile></div></notextile>
|
data/README.textile
CHANGED
@@ -2,15 +2,7 @@ Monkeyshines is a tool for doing an algorithmic scrape.
|
|
2
2
|
|
3
3
|
It's designed to handle large-scale scrapes that may exceed the capabilities of single-machine relational databases, so it plays nicely with Hadoop / Wukong, with distributed databases (MongoDB, tokyocabinet, etc.), and distributed job queue (eg "edamame/beanstalk":http://mrflip.github.com/edamame).
|
4
4
|
|
5
|
-
|
6
|
-
|
7
|
-
h2. Overview
|
8
|
-
|
9
|
-
A monkeyshines scraper is simple in principle:
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
h2. Dependencies
|
5
|
+
h2. Install
|
14
6
|
|
15
7
|
This is best run standalone -- not as a gem; it's still in heavy development. I recommend cloning
|
16
8
|
|
@@ -27,22 +19,18 @@ Additionally, you'll need some of these gems:
|
|
27
19
|
* extlib (0.9.12)
|
28
20
|
* htmlentities (4.2.0)
|
29
21
|
|
30
|
-
To build the gem, you'll need
|
31
|
-
|
32
|
-
* git (1.2.2)
|
33
|
-
* jeweler (1.2.1)
|
34
|
-
* rake (0.8.7)
|
35
|
-
* rspec (1.2.6)
|
36
|
-
* rubyforge (1.0.4)
|
37
|
-
* sources (0.0.1)
|
38
|
-
|
39
22
|
And if you spell ruby with a 'j', you'll want
|
40
23
|
|
41
24
|
* jruby-openssl (0.5.2)
|
42
25
|
* json-jruby (1.1.7)
|
43
26
|
|
27
|
+
---------------------------------------------------------------------------
|
28
|
+
|
29
|
+
h2. Help!
|
44
30
|
|
31
|
+
Send Monkeyshines questions to the "Infinite Monkeywrench mailing list":http://groups.google.com/group/infochimps-code
|
45
32
|
|
33
|
+
---------------------------------------------------------------------------
|
46
34
|
|
47
35
|
h2. Request Queue
|
48
36
|
|
@@ -53,8 +41,6 @@ Request stream can be metered using read-through, scheduled (eg cron), or test-a
|
|
53
41
|
* Scheduled
|
54
42
|
* Test and sleep. A queue of resources is cyclically polled, sleeping whenever bored.
|
55
43
|
|
56
|
-
|
57
|
-
|
58
44
|
h2. Requests
|
59
45
|
|
60
46
|
* Base: simple fetch and store of URI. (URI specifies immutable unique resource)
|
@@ -0,0 +1,6 @@
|
|
1
|
+
|
2
|
+
h2. Where the hell?
|
3
|
+
|
4
|
+
Most of these examples live in the "wuclan":http://mrflip.github.com/wuclan/ repository -- the directories are symlinks into that repo.
|
5
|
+
|
6
|
+
Either install them from git source in adjoining directories, or just head into the wuclan examples/ directory and run stuff from there.
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: monkeyshines
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Philip (flip) Kromer
|
@@ -49,18 +49,12 @@ executables: []
|
|
49
49
|
extensions: []
|
50
50
|
|
51
51
|
extra_rdoc_files:
|
52
|
-
- LICENSE
|
53
52
|
- LICENSE.textile
|
54
53
|
- README.textile
|
55
54
|
files:
|
56
|
-
- .document
|
57
|
-
- .gitignore
|
58
|
-
- LICENSE
|
59
55
|
- LICENSE.textile
|
60
56
|
- README.textile
|
61
|
-
-
|
62
|
-
- VERSION
|
63
|
-
- examples/.gitignore
|
57
|
+
- examples/README.textile
|
64
58
|
- examples/bulk_urls/scrape_bulk_urls.rb
|
65
59
|
- examples/rename_tree/rename_hdp_tree.rb
|
66
60
|
- examples/rename_tree/rename_ripd_tree.rb
|
@@ -133,8 +127,6 @@ files:
|
|
133
127
|
- lib/monkeyshines/utils/union_interval.rb
|
134
128
|
- lib/monkeyshines/utils/uri.rb
|
135
129
|
- lib/monkeyshines/utils/uuid.rb
|
136
|
-
- monkeyshines.gemspec
|
137
|
-
- scrape_from_file.rb
|
138
130
|
- spec/monkeyshines_spec.rb
|
139
131
|
- spec/spec_helper.rb
|
140
132
|
has_rdoc: true
|
data/.document
DELETED
data/.gitignore
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
\#*
|
2
|
-
.\#*
|
3
|
-
*~
|
4
|
-
.DS_Store
|
5
|
-
Icon?
|
6
|
-
REVISION
|
7
|
-
TAGS*
|
8
|
-
nohup.out
|
9
|
-
.bzr
|
10
|
-
.hg
|
11
|
-
.svn
|
12
|
-
|
13
|
-
a.out
|
14
|
-
*.o
|
15
|
-
*.pyc
|
16
|
-
*.so
|
17
|
-
*.stackdump
|
18
|
-
*.sw?
|
19
|
-
*.tmproj
|
20
|
-
*_flymake.*
|
21
|
-
.project
|
22
|
-
.pydevproject
|
23
|
-
.settings
|
24
|
-
.tasks-cache
|
25
|
-
.yardoc
|
26
|
-
|
27
|
-
/**/*DONTVERSION*
|
28
|
-
/**/*private*
|
29
|
-
/**/cache/*
|
30
|
-
/**/log/*
|
31
|
-
/**/tmp/*
|
32
|
-
/coverage
|
33
|
-
/doc/*
|
34
|
-
/pkg/*
|
35
|
-
/rdoc/*
|
36
|
-
|
37
|
-
/db/*.sqlite3
|
38
|
-
/db/sphinx
|
39
|
-
/config/*.sphinx.conf
|
40
|
-
/config/database.yml
|
41
|
-
/config/sphinx.yml
|
42
|
-
/public/stylesheets/compiled/*
|
43
|
-
/vendor/src/**/*
|
data/LICENSE
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
Copyright (c) 2009 Philip (flip) Kromer
|
2
|
-
|
3
|
-
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
-
a copy of this software and associated documentation files (the
|
5
|
-
"Software"), to deal in the Software without restriction, including
|
6
|
-
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
-
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
-
permit persons to whom the Software is furnished to do so, subject to
|
9
|
-
the following conditions:
|
10
|
-
|
11
|
-
The above copyright notice and this permission notice shall be
|
12
|
-
included in all copies or substantial portions of the Software.
|
13
|
-
|
14
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
-
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
-
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
-
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
-
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
-
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
-
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/Rakefile
DELETED
@@ -1,105 +0,0 @@
|
|
1
|
-
require 'rubygems'
|
2
|
-
require 'rake'
|
3
|
-
|
4
|
-
begin
|
5
|
-
require 'jeweler'
|
6
|
-
Jeweler::Tasks.new do |gem|
|
7
|
-
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
8
|
-
gem.name = "monkeyshines"
|
9
|
-
gem.summary = %Q{A simple scraper for directed scrapes of APIs, feed or structured HTML.}
|
10
|
-
gem.description = %Q{A simple scraper for directed scrapes of APIs, feed or structured HTML. Plays nicely with wuclan and wukong.}
|
11
|
-
gem.email = "flip@infochimps.org"
|
12
|
-
gem.homepage = "http://github.com/mrflip/monkeyshines"
|
13
|
-
gem.authors = ["Philip (flip) Kromer"]
|
14
|
-
gem.add_dependency 'addressable'
|
15
|
-
gem.add_dependency 'uuid'
|
16
|
-
gem.add_dependency 'wukong'
|
17
|
-
end
|
18
|
-
Jeweler::GemcutterTasks.new
|
19
|
-
rescue LoadError
|
20
|
-
puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
|
21
|
-
end
|
22
|
-
|
23
|
-
require 'spec/rake/spectask'
|
24
|
-
Spec::Rake::SpecTask.new(:spec) do |spec|
|
25
|
-
spec.libs << 'lib' << 'spec'
|
26
|
-
spec.spec_files = FileList['spec/**/*_spec.rb']
|
27
|
-
end
|
28
|
-
|
29
|
-
Spec::Rake::SpecTask.new(:rcov) do |spec|
|
30
|
-
spec.libs << 'lib' << 'spec'
|
31
|
-
spec.pattern = 'spec/**/*_spec.rb'
|
32
|
-
spec.rcov = true
|
33
|
-
end
|
34
|
-
task :spec => :check_dependencies
|
35
|
-
task :default => :spec
|
36
|
-
|
37
|
-
begin
|
38
|
-
require 'reek/rake_task'
|
39
|
-
Reek::RakeTask.new do |t|
|
40
|
-
t.fail_on_error = true
|
41
|
-
t.verbose = false
|
42
|
-
t.source_files = ['lib/**/*.rb', 'examples/**/*.rb']
|
43
|
-
end
|
44
|
-
rescue LoadError
|
45
|
-
task :reek do
|
46
|
-
abort "Reek is not available. In order to run reek, you must: sudo gem install reek"
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
begin
|
51
|
-
require 'roodi'
|
52
|
-
require 'roodi_task'
|
53
|
-
RoodiTask.new do |t|
|
54
|
-
t.verbose = false
|
55
|
-
end
|
56
|
-
rescue LoadError
|
57
|
-
task :roodi do
|
58
|
-
abort "Roodi is not available. In order to run roodi, you must: sudo gem install roodi"
|
59
|
-
end
|
60
|
-
end
|
61
|
-
|
62
|
-
begin
|
63
|
-
require 'yard'
|
64
|
-
YARD::Rake::YardocTask.new do |yard|
|
65
|
-
end
|
66
|
-
rescue LoadError
|
67
|
-
task :yardoc do
|
68
|
-
abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
require 'rake/rdoctask'
|
73
|
-
Rake::RDocTask.new do |rdoc|
|
74
|
-
require 'rdoc'
|
75
|
-
if File.exist?('VERSION')
|
76
|
-
version = File.read('VERSION')
|
77
|
-
else
|
78
|
-
version = ""
|
79
|
-
end
|
80
|
-
|
81
|
-
rdoc.options += [
|
82
|
-
'-SHN',
|
83
|
-
'-f', 'darkfish', # use darkfish rdoc styler
|
84
|
-
]
|
85
|
-
rdoc.rdoc_dir = 'rdoc'
|
86
|
-
rdoc.title = "edamame #{version}"
|
87
|
-
#
|
88
|
-
File.open(File.dirname(__FILE__)+'/.document').each{|line| rdoc.rdoc_files.include(line.chomp) }
|
89
|
-
end
|
90
|
-
|
91
|
-
require 'rake/rdoctask'
|
92
|
-
Rake::RDocTask.new do |rdoc|
|
93
|
-
if File.exist?('VERSION.yml')
|
94
|
-
config = YAML.load(File.read('VERSION.yml'))
|
95
|
-
version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
|
96
|
-
else
|
97
|
-
version = ""
|
98
|
-
end
|
99
|
-
|
100
|
-
rdoc.rdoc_dir = 'rdoc'
|
101
|
-
rdoc.title = "monkeyshines #{version}"
|
102
|
-
rdoc.rdoc_files.include('README*')
|
103
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
104
|
-
end
|
105
|
-
|
data/VERSION
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
0.0.2
|
data/examples/.gitignore
DELETED
data/monkeyshines.gemspec
DELETED
@@ -1,147 +0,0 @@
|
|
1
|
-
# Generated by jeweler
|
2
|
-
# DO NOT EDIT THIS FILE
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
|
4
|
-
# -*- encoding: utf-8 -*-
|
5
|
-
|
6
|
-
Gem::Specification.new do |s|
|
7
|
-
s.name = %q{monkeyshines}
|
8
|
-
s.version = "0.0.2"
|
9
|
-
|
10
|
-
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
-
s.authors = ["Philip (flip) Kromer"]
|
12
|
-
s.date = %q{2009-10-12}
|
13
|
-
s.description = %q{A simple scraper for directed scrapes of APIs, feed or structured HTML. Plays nicely with wuclan and wukong.}
|
14
|
-
s.email = %q{flip@infochimps.org}
|
15
|
-
s.extra_rdoc_files = [
|
16
|
-
"LICENSE",
|
17
|
-
"LICENSE.textile",
|
18
|
-
"README.textile"
|
19
|
-
]
|
20
|
-
s.files = [
|
21
|
-
".document",
|
22
|
-
".gitignore",
|
23
|
-
"LICENSE",
|
24
|
-
"LICENSE.textile",
|
25
|
-
"README.textile",
|
26
|
-
"Rakefile",
|
27
|
-
"VERSION",
|
28
|
-
"examples/.gitignore",
|
29
|
-
"examples/bulk_urls/scrape_bulk_urls.rb",
|
30
|
-
"examples/rename_tree/rename_hdp_tree.rb",
|
31
|
-
"examples/rename_tree/rename_ripd_tree.rb",
|
32
|
-
"examples/rss_feeds/scrape_rss_feeds.rb",
|
33
|
-
"examples/shorturls/README.textile",
|
34
|
-
"examples/shorturls/bulkdump_shorturls.rb",
|
35
|
-
"examples/shorturls/bulkload_shorturls.rb",
|
36
|
-
"examples/shorturls/extract_urls.rb",
|
37
|
-
"examples/shorturls/multiplex_shorturl_cache.rb",
|
38
|
-
"examples/shorturls/old/multidump_and_fix_shorturls.rb",
|
39
|
-
"examples/shorturls/old/shorturl_stats.rb",
|
40
|
-
"examples/shorturls/scrape_shorturls.rb",
|
41
|
-
"examples/shorturls/shorturl_request.rb",
|
42
|
-
"examples/shorturls/shorturl_sequence.rb",
|
43
|
-
"examples/shorturls/shorturl_start_tyrant.sh",
|
44
|
-
"examples/shorturls/start_shorturl_cache.sh",
|
45
|
-
"lib/monkeyshines.rb",
|
46
|
-
"lib/monkeyshines/extensions.rb",
|
47
|
-
"lib/monkeyshines/fetcher.rb",
|
48
|
-
"lib/monkeyshines/fetcher/authed_http_fetcher.rb",
|
49
|
-
"lib/monkeyshines/fetcher/base.rb",
|
50
|
-
"lib/monkeyshines/fetcher/fake_fetcher.rb",
|
51
|
-
"lib/monkeyshines/fetcher/http_fetcher.rb",
|
52
|
-
"lib/monkeyshines/fetcher/http_head_fetcher.rb",
|
53
|
-
"lib/monkeyshines/monitor.rb",
|
54
|
-
"lib/monkeyshines/monitor/chunked_store.rb",
|
55
|
-
"lib/monkeyshines/monitor/periodic_logger.rb",
|
56
|
-
"lib/monkeyshines/monitor/periodic_monitor.rb",
|
57
|
-
"lib/monkeyshines/options.rb",
|
58
|
-
"lib/monkeyshines/recursive_runner.rb",
|
59
|
-
"lib/monkeyshines/repository/base.rb",
|
60
|
-
"lib/monkeyshines/repository/s3.rb",
|
61
|
-
"lib/monkeyshines/request_stream.rb",
|
62
|
-
"lib/monkeyshines/request_stream/base.rb",
|
63
|
-
"lib/monkeyshines/request_stream/edamame_queue.rb",
|
64
|
-
"lib/monkeyshines/request_stream/klass_request_stream.rb",
|
65
|
-
"lib/monkeyshines/request_stream/simple_request_stream.rb",
|
66
|
-
"lib/monkeyshines/runner.rb",
|
67
|
-
"lib/monkeyshines/runner_core/options.rb",
|
68
|
-
"lib/monkeyshines/runner_core/parsing_runner.rb",
|
69
|
-
"lib/monkeyshines/scrape_job/old_paginated.rb",
|
70
|
-
"lib/monkeyshines/scrape_job/recursive.rb",
|
71
|
-
"lib/monkeyshines/scrape_request.rb",
|
72
|
-
"lib/monkeyshines/scrape_request/paginated.rb",
|
73
|
-
"lib/monkeyshines/scrape_request/raw_json_contents.rb",
|
74
|
-
"lib/monkeyshines/scrape_request/signed_url.rb",
|
75
|
-
"lib/monkeyshines/store.rb",
|
76
|
-
"lib/monkeyshines/store/base.rb",
|
77
|
-
"lib/monkeyshines/store/chunked_flat_file_store.rb",
|
78
|
-
"lib/monkeyshines/store/conditional_store.rb",
|
79
|
-
"lib/monkeyshines/store/factory.rb",
|
80
|
-
"lib/monkeyshines/store/flat_file_store.rb",
|
81
|
-
"lib/monkeyshines/store/key_store.rb",
|
82
|
-
"lib/monkeyshines/store/null_store.rb",
|
83
|
-
"lib/monkeyshines/store/read_thru_store.rb",
|
84
|
-
"lib/monkeyshines/store/tokyo_tdb_key_store.rb",
|
85
|
-
"lib/monkeyshines/store/tyrant_rdb_key_store.rb",
|
86
|
-
"lib/monkeyshines/store/tyrant_tdb_key_store.rb",
|
87
|
-
"lib/monkeyshines/utils/factory_module.rb",
|
88
|
-
"lib/monkeyshines/utils/filename_pattern.rb",
|
89
|
-
"lib/monkeyshines/utils/logger.rb",
|
90
|
-
"lib/monkeyshines/utils/trollop-1.14/FAQ.txt",
|
91
|
-
"lib/monkeyshines/utils/trollop-1.14/History.txt",
|
92
|
-
"lib/monkeyshines/utils/trollop-1.14/Manifest.txt",
|
93
|
-
"lib/monkeyshines/utils/trollop-1.14/README.txt",
|
94
|
-
"lib/monkeyshines/utils/trollop-1.14/Rakefile",
|
95
|
-
"lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb",
|
96
|
-
"lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb",
|
97
|
-
"lib/monkeyshines/utils/trollop.rb",
|
98
|
-
"lib/monkeyshines/utils/union_interval.rb",
|
99
|
-
"lib/monkeyshines/utils/uri.rb",
|
100
|
-
"lib/monkeyshines/utils/uuid.rb",
|
101
|
-
"monkeyshines.gemspec",
|
102
|
-
"scrape_from_file.rb",
|
103
|
-
"spec/monkeyshines_spec.rb",
|
104
|
-
"spec/spec_helper.rb"
|
105
|
-
]
|
106
|
-
s.homepage = %q{http://github.com/mrflip/monkeyshines}
|
107
|
-
s.rdoc_options = ["--charset=UTF-8"]
|
108
|
-
s.require_paths = ["lib"]
|
109
|
-
s.rubygems_version = %q{1.3.5}
|
110
|
-
s.summary = %q{A simple scraper for directed scrapes of APIs, feed or structured HTML.}
|
111
|
-
s.test_files = [
|
112
|
-
"spec/monkeyshines_spec.rb",
|
113
|
-
"spec/spec_helper.rb",
|
114
|
-
"examples/bulk_urls/scrape_bulk_urls.rb",
|
115
|
-
"examples/rename_tree/rename_hdp_tree.rb",
|
116
|
-
"examples/rename_tree/rename_ripd_tree.rb",
|
117
|
-
"examples/rss_feeds/scrape_rss_feeds.rb",
|
118
|
-
"examples/shorturls/bulkdump_shorturls.rb",
|
119
|
-
"examples/shorturls/bulkload_shorturls.rb",
|
120
|
-
"examples/shorturls/extract_urls.rb",
|
121
|
-
"examples/shorturls/multiplex_shorturl_cache.rb",
|
122
|
-
"examples/shorturls/old/multidump_and_fix_shorturls.rb",
|
123
|
-
"examples/shorturls/old/shorturl_stats.rb",
|
124
|
-
"examples/shorturls/scrape_shorturls.rb",
|
125
|
-
"examples/shorturls/shorturl_request.rb",
|
126
|
-
"examples/shorturls/shorturl_sequence.rb"
|
127
|
-
]
|
128
|
-
|
129
|
-
if s.respond_to? :specification_version then
|
130
|
-
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
131
|
-
s.specification_version = 3
|
132
|
-
|
133
|
-
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
134
|
-
s.add_runtime_dependency(%q<addressable>, [">= 0"])
|
135
|
-
s.add_runtime_dependency(%q<uuid>, [">= 0"])
|
136
|
-
s.add_runtime_dependency(%q<wukong>, [">= 0"])
|
137
|
-
else
|
138
|
-
s.add_dependency(%q<addressable>, [">= 0"])
|
139
|
-
s.add_dependency(%q<uuid>, [">= 0"])
|
140
|
-
s.add_dependency(%q<wukong>, [">= 0"])
|
141
|
-
end
|
142
|
-
else
|
143
|
-
s.add_dependency(%q<addressable>, [">= 0"])
|
144
|
-
s.add_dependency(%q<uuid>, [">= 0"])
|
145
|
-
s.add_dependency(%q<wukong>, [">= 0"])
|
146
|
-
end
|
147
|
-
end
|
data/scrape_from_file.rb
DELETED
@@ -1,44 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
$: << File.dirname(__FILE__)+'/lib'
|
4
|
-
require 'wukong'
|
5
|
-
require 'monkeyshines'
|
6
|
-
require 'monkeyshines/http_fetcher'
|
7
|
-
|
8
|
-
request_filename = ARGV[0]
|
9
|
-
if ! request_filename
|
10
|
-
warn "Please give the name of a file holding URLs to scrape"; exit
|
11
|
-
end
|
12
|
-
dump_filename = "/tmp/req_dump.tsv"
|
13
|
-
|
14
|
-
class SimpleScrapeRequest < Struct.new(
|
15
|
-
:url,
|
16
|
-
:scraped_at, :response_code, :response_message,
|
17
|
-
:contents )
|
18
|
-
end
|
19
|
-
|
20
|
-
class String
|
21
|
-
def to_flat
|
22
|
-
self
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
class Monkeyshines::FlatFileStore
|
27
|
-
attr_accessor :file, :filename
|
28
|
-
def initialize filename
|
29
|
-
self.filename = filename
|
30
|
-
self.file = File.open(filename, "w")
|
31
|
-
end
|
32
|
-
def << contents
|
33
|
-
p contents.to_flat
|
34
|
-
self.file << contents.to_flat.join("\t") + "\n"
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
fetcher = Monkeyshines::HttpFetcher.new('twitter.com')
|
39
|
-
reqs = Monkeyshines::FlatFileRequestStream.new(request_filename, SimpleScrapeRequest)
|
40
|
-
store = Monkeyshines::FlatFileStore.new(dump_filename)
|
41
|
-
reqs.each do |scrape_request|
|
42
|
-
p scrape_request
|
43
|
-
store << fetcher.get(scrape_request)
|
44
|
-
end
|