monkeyshines 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +4 -0
- data/.gitignore +43 -0
- data/LICENSE +20 -0
- data/LICENSE.textile +20 -0
- data/README.textile +125 -0
- data/Rakefile +105 -0
- data/VERSION +1 -0
- data/examples/.gitignore +4 -0
- data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
- data/examples/rename_tree/rename_hdp_tree.rb +151 -0
- data/examples/rename_tree/rename_ripd_tree.rb +82 -0
- data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
- data/examples/shorturls/README.textile +111 -0
- data/examples/shorturls/bulkdump_shorturls.rb +46 -0
- data/examples/shorturls/bulkload_shorturls.rb +45 -0
- data/examples/shorturls/extract_urls.rb +12 -0
- data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
- data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
- data/examples/shorturls/old/shorturl_stats.rb +81 -0
- data/examples/shorturls/scrape_shorturls.rb +112 -0
- data/examples/shorturls/shorturl_request.rb +29 -0
- data/examples/shorturls/shorturl_sequence.rb +121 -0
- data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
- data/examples/shorturls/start_shorturl_cache.sh +2 -0
- data/lib/monkeyshines.rb +31 -0
- data/lib/monkeyshines/extensions.rb +16 -0
- data/lib/monkeyshines/fetcher.rb +10 -0
- data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
- data/lib/monkeyshines/fetcher/base.rb +44 -0
- data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
- data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
- data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
- data/lib/monkeyshines/monitor.rb +7 -0
- data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
- data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
- data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
- data/lib/monkeyshines/options.rb +59 -0
- data/lib/monkeyshines/recursive_runner.rb +26 -0
- data/lib/monkeyshines/repository/base.rb +57 -0
- data/lib/monkeyshines/repository/s3.rb +169 -0
- data/lib/monkeyshines/request_stream.rb +11 -0
- data/lib/monkeyshines/request_stream/base.rb +32 -0
- data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
- data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
- data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
- data/lib/monkeyshines/runner.rb +161 -0
- data/lib/monkeyshines/runner_core/options.rb +5 -0
- data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
- data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
- data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
- data/lib/monkeyshines/scrape_request.rb +136 -0
- data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
- data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
- data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
- data/lib/monkeyshines/store.rb +14 -0
- data/lib/monkeyshines/store/base.rb +29 -0
- data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
- data/lib/monkeyshines/store/conditional_store.rb +57 -0
- data/lib/monkeyshines/store/factory.rb +8 -0
- data/lib/monkeyshines/store/flat_file_store.rb +84 -0
- data/lib/monkeyshines/store/key_store.rb +51 -0
- data/lib/monkeyshines/store/null_store.rb +15 -0
- data/lib/monkeyshines/store/read_thru_store.rb +22 -0
- data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
- data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
- data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
- data/lib/monkeyshines/utils/factory_module.rb +106 -0
- data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
- data/lib/monkeyshines/utils/logger.rb +15 -0
- data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
- data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
- data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
- data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
- data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
- data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
- data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
- data/lib/monkeyshines/utils/trollop.rb +744 -0
- data/lib/monkeyshines/utils/union_interval.rb +52 -0
- data/lib/monkeyshines/utils/uri.rb +70 -0
- data/lib/monkeyshines/utils/uuid.rb +32 -0
- data/monkeyshines.gemspec +147 -0
- data/scrape_from_file.rb +44 -0
- data/spec/monkeyshines_spec.rb +7 -0
- data/spec/spec_helper.rb +9 -0
- metadata +183 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
#
|
2
|
+
# A numeric interval
|
3
|
+
#
|
4
|
+
# --
|
5
|
+
# could be done with a Range but proved annoying in practice
|
6
|
+
# what with Range's immutability, etc.
|
7
|
+
# ++
|
8
|
+
#
|
9
|
+
class UnionInterval
  attr_accessor :min, :max

  # Initialize with set min or max values; a nil bound means "no bound" on
  # that side.
  #
  # To create an interval with no lower bound call:
  #   UnionInterval.new(nil, 69)
  # Pass nil (or omit) +max+ for no upper bound:
  #   UnionInterval.new(5, nil)
  def initialize min=nil, max=nil
    self.min = min
    self.max = max
  end

  # Expand the interval, in place, to include all the vals.
  #
  # +vals+ may be a single value, an array, a range, another UnionInterval,
  # or nil (which leaves the interval unchanged). Kernel#Array is used for
  # the coercion because a bare number has no #to_a on Ruby 1.9+.
  def << vals
    candidates = Array(vals)
    self.min = [min, candidates].flatten.compact.min
    self.max = [max, candidates].flatten.compact.max
  end

  # Non-destructive version of #<<: returns a new UnionInterval spanning this
  # interval and everything in +min_max+; the receiver is not modified.
  def + min_max
    candidates = Array(min_max)
    sum_min = [min, candidates].flatten.compact.min
    sum_max = [max, candidates].flatten.compact.max
    UnionInterval.new sum_min, sum_max
  end

  # returns span as an array:
  #   [min, max]
  def to_a
    [min, max]
  end

  # true if the extent is defined but empty (lower bound exceeds upper bound).
  # Falsy (nil or false) otherwise, including when either bound is missing.
  def empty?
    min && max && (min > max)
  end

  # truthy if +val+ lies within the interval, bounds inclusive. A missing
  # bound never excludes anything; a nil +val+ is never included.
  def include? val
    val && (!min || (val >= min)) && (!max || (val <= max))
  end

  # max - min, or 0 when either bound is missing.
  def size
    return 0 unless max && min
    max - min
  end

  # string conversion:
  #   #<span:7..956734>
  def to_s
    "#<span:#{min}..#{max}>"
  end

  def inspect() to_s end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
module Addressable
  #
  # Adds .scrub_url, #revhost, #md5hash and #url_uuid to Addressable::URI
  #
  class URI
    #
    # Characters that are illegal in a URL but *do* turn up in the wild; they
    # are let through unescaped. Note that ' ' (space) is one of them.
    #
    URL_ILLEGAL_BUT_WHATEVER_DOOD_CHARS = '\{\}\| \^\`'
    #
    # Every character that scrub_url leaves untouched: the tolerated
    # miscreants above, the unreserved and reserved URL characters, and '%'.
    # The string is spliced into a regexp character class, hence the escapes.
    #
    PERMISSIVE_SCRUB_CHARS = [
      URL_ILLEGAL_BUT_WHATEVER_DOOD_CHARS,
      Addressable::URI::CharacterClasses::UNRESERVED,
      Addressable::URI::CharacterClasses::RESERVED,
      '%'
    ].join

    #
    # Replace every url-insane character with its %XX encoding. We don't
    # really care whether the resulting URL does anything: the point is only
    # to remove stuff that absolutely doesn't belong.
    #
    # Per the original author's note, this is adapted from a method in
    # Addressable::URI that had a bug, fixed here.
    # (http://addressable.rubyforge.org)
    #
    # '%' is deliberately /not/ re-encoded -- the url is assumed to be
    # encoded already, if perhaps poorly.
    #
    # In practice the illegal characters most often seen are those in
    # URL_ILLEGAL_BUT_WHATEVER_DOOD_CHARS plus
    #   <>"\t\\
    #
    # A blank +url+ is returned as-is.
    #
    def self.scrub_url url
      return url if url.blank?
      url.gsub(/[^#{PERMISSIVE_SCRUB_CHARS}]+/) do |run|
        # escape the run byte by byte, as uppercase two-digit hex
        run.unpack('C*').map{ |byte| format('%%%02X', byte) }.join
      end
    end

    #
    # +revhost+
    # the dot-reversed host:
    #   foo.company.com => com.company.foo
    #
    # A host without any dot (or no host at all) is returned unchanged.
    #
    def revhost
      host =~ /\./ ? host.split('.').reverse.join('.') : host
    end

    #
    # The MD5 hex digest of this URI's normalized string form
    #
    # make sure to require 'digest/md5' somewhere...
    #
    def md5hash
      Digest::MD5.hexdigest(normalize.to_s)
    end

    #
    # +url_uuid+ -- RFC-4122 ver.5 uuid built from this URI's normalized
    # string form in the URL namespace
    #
    # See http://www.faqs.org/rfcs/rfc4122.html
    #
    # You need to require "monkeyshines/utils/uuid" as well...
    #
    def url_uuid
      UUID.sha1_create(UUID_URL_NAMESPACE, normalize.to_s)
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'uuidtools'
|
2
|
+
class UUID

  #
  # A string suitable for using as a path name --
  #
  # Ex.
  #    urn:uuid:3c0dce44-80a8-11dd-a897-001ff35a0a8b =>
  #    urn_uuid/3c0dce44/80a8/11dd/a897/001ff35a0a8b
  #
  # It's well possible there are more perspicacious choices for points to split
  # the string, but until we hit that limit this'll do.
  #
  def to_path
    # every ':' and '-' in the string form becomes a directory separator
    'urn_uuid/' + to_s.gsub(/[\:\-]/,'/')
  end

  #
  # Insert the dashes into a run of 32 hex digits:
  #    3c0dce4480a811dda897001ff35a0a8b =>
  #    3c0dce44-80a8-11dd-a897-001ff35a0a8b
  #
  # Upper- and lowercase digits are both accepted; the case is preserved.
  # Raises ArgumentError if +str+ contains no run of 32 hex digits.
  #
  def self.hex_to_str str
    groups = /([\da-f]{8})([\da-f]{4})([\da-f]{4})([\da-f]{4})([\da-f]{12})/i.match(str)
    raise ArgumentError, "not a 32-digit hex UUID: #{str.inspect}" unless groups
    groups.captures.join '-'
  end

  #
  # Parse a UUID given as 32 undashed hex digits, by dashing it with
  # hex_to_str and handing the result to +parse+.
  #
  def self.parse_hex str
    parse(UUID.hex_to_str(str))
  end

  # Overrides UUIDTools -- force 32 hex digits (leading zeros)
  def hexdigest
    "%032x" % self.to_i
  end

end
|
@@ -0,0 +1,147 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# NOTE(review): this file is generated by jeweler; the durable fix belongs in
# Jeweler::Tasks in the Rakefile (or a newer jeweler), then `rake gemspec`.
Gem::Specification.new do |s|
  s.name = %q{monkeyshines}
  s.version = "0.0.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Philip (flip) Kromer"]
  s.date = %q{2009-10-12}
  s.description = %q{A simple scraper for directed scrapes of APIs, feed or structured HTML. Plays nicely with wuclan and wukong.}
  s.email = %q{flip@infochimps.org}
  s.extra_rdoc_files = [
    "LICENSE",
    "LICENSE.textile",
    "README.textile"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "LICENSE.textile",
    "README.textile",
    "Rakefile",
    "VERSION",
    "examples/.gitignore",
    "examples/bulk_urls/scrape_bulk_urls.rb",
    "examples/rename_tree/rename_hdp_tree.rb",
    "examples/rename_tree/rename_ripd_tree.rb",
    "examples/rss_feeds/scrape_rss_feeds.rb",
    "examples/shorturls/README.textile",
    "examples/shorturls/bulkdump_shorturls.rb",
    "examples/shorturls/bulkload_shorturls.rb",
    "examples/shorturls/extract_urls.rb",
    "examples/shorturls/multiplex_shorturl_cache.rb",
    "examples/shorturls/old/multidump_and_fix_shorturls.rb",
    "examples/shorturls/old/shorturl_stats.rb",
    "examples/shorturls/scrape_shorturls.rb",
    "examples/shorturls/shorturl_request.rb",
    "examples/shorturls/shorturl_sequence.rb",
    "examples/shorturls/shorturl_start_tyrant.sh",
    "examples/shorturls/start_shorturl_cache.sh",
    "lib/monkeyshines.rb",
    "lib/monkeyshines/extensions.rb",
    "lib/monkeyshines/fetcher.rb",
    "lib/monkeyshines/fetcher/authed_http_fetcher.rb",
    "lib/monkeyshines/fetcher/base.rb",
    "lib/monkeyshines/fetcher/fake_fetcher.rb",
    "lib/monkeyshines/fetcher/http_fetcher.rb",
    "lib/monkeyshines/fetcher/http_head_fetcher.rb",
    "lib/monkeyshines/monitor.rb",
    "lib/monkeyshines/monitor/chunked_store.rb",
    "lib/monkeyshines/monitor/periodic_logger.rb",
    "lib/monkeyshines/monitor/periodic_monitor.rb",
    "lib/monkeyshines/options.rb",
    "lib/monkeyshines/recursive_runner.rb",
    "lib/monkeyshines/repository/base.rb",
    "lib/monkeyshines/repository/s3.rb",
    "lib/monkeyshines/request_stream.rb",
    "lib/monkeyshines/request_stream/base.rb",
    "lib/monkeyshines/request_stream/edamame_queue.rb",
    "lib/monkeyshines/request_stream/klass_request_stream.rb",
    "lib/monkeyshines/request_stream/simple_request_stream.rb",
    "lib/monkeyshines/runner.rb",
    "lib/monkeyshines/runner_core/options.rb",
    "lib/monkeyshines/runner_core/parsing_runner.rb",
    "lib/monkeyshines/scrape_job/old_paginated.rb",
    "lib/monkeyshines/scrape_job/recursive.rb",
    "lib/monkeyshines/scrape_request.rb",
    "lib/monkeyshines/scrape_request/paginated.rb",
    "lib/monkeyshines/scrape_request/raw_json_contents.rb",
    "lib/monkeyshines/scrape_request/signed_url.rb",
    "lib/monkeyshines/store.rb",
    "lib/monkeyshines/store/base.rb",
    "lib/monkeyshines/store/chunked_flat_file_store.rb",
    "lib/monkeyshines/store/conditional_store.rb",
    "lib/monkeyshines/store/factory.rb",
    "lib/monkeyshines/store/flat_file_store.rb",
    "lib/monkeyshines/store/key_store.rb",
    "lib/monkeyshines/store/null_store.rb",
    "lib/monkeyshines/store/read_thru_store.rb",
    "lib/monkeyshines/store/tokyo_tdb_key_store.rb",
    "lib/monkeyshines/store/tyrant_rdb_key_store.rb",
    "lib/monkeyshines/store/tyrant_tdb_key_store.rb",
    "lib/monkeyshines/utils/factory_module.rb",
    "lib/monkeyshines/utils/filename_pattern.rb",
    "lib/monkeyshines/utils/logger.rb",
    "lib/monkeyshines/utils/trollop-1.14/FAQ.txt",
    "lib/monkeyshines/utils/trollop-1.14/History.txt",
    "lib/monkeyshines/utils/trollop-1.14/Manifest.txt",
    "lib/monkeyshines/utils/trollop-1.14/README.txt",
    "lib/monkeyshines/utils/trollop-1.14/Rakefile",
    "lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb",
    "lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb",
    "lib/monkeyshines/utils/trollop.rb",
    "lib/monkeyshines/utils/union_interval.rb",
    "lib/monkeyshines/utils/uri.rb",
    "lib/monkeyshines/utils/uuid.rb",
    "monkeyshines.gemspec",
    "scrape_from_file.rb",
    "spec/monkeyshines_spec.rb",
    "spec/spec_helper.rb"
  ]
  s.homepage = %q{http://github.com/mrflip/monkeyshines}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{A simple scraper for directed scrapes of APIs, feed or structured HTML.}
  s.test_files = [
    "spec/monkeyshines_spec.rb",
    "spec/spec_helper.rb",
    "examples/bulk_urls/scrape_bulk_urls.rb",
    "examples/rename_tree/rename_hdp_tree.rb",
    "examples/rename_tree/rename_ripd_tree.rb",
    "examples/rss_feeds/scrape_rss_feeds.rb",
    "examples/shorturls/bulkdump_shorturls.rb",
    "examples/shorturls/bulkload_shorturls.rb",
    "examples/shorturls/extract_urls.rb",
    "examples/shorturls/multiplex_shorturl_cache.rb",
    "examples/shorturls/old/multidump_and_fix_shorturls.rb",
    "examples/shorturls/old/shorturl_stats.rb",
    "examples/shorturls/scrape_shorturls.rb",
    "examples/shorturls/shorturl_request.rb",
    "examples/shorturls/shorturl_sequence.rb"
  ]

  # NOTE(review): lib/monkeyshines/utils/uuid.rb does `require 'uuidtools'`,
  # yet the dependency declared below is the gem named `uuid` -- confirm which
  # gem is intended. Left unchanged here.
  if s.respond_to? :specification_version then
    s.specification_version = 3

    # Prefer Gem::VERSION and fall back to the legacy Gem::RubyGemsVersion
    # constant only where Gem::VERSION is not defined, so the spec loads on
    # both old and current RubyGems.
    installed_rubygems = defined?(Gem::VERSION) ? Gem::VERSION : Gem::RubyGemsVersion

    if Gem::Version.new(installed_rubygems) >= Gem::Version.new('1.2.0') then
      s.add_runtime_dependency(%q<addressable>, [">= 0"])
      s.add_runtime_dependency(%q<uuid>, [">= 0"])
      s.add_runtime_dependency(%q<wukong>, [">= 0"])
    else
      s.add_dependency(%q<addressable>, [">= 0"])
      s.add_dependency(%q<uuid>, [">= 0"])
      s.add_dependency(%q<wukong>, [">= 0"])
    end
  else
    s.add_dependency(%q<addressable>, [">= 0"])
    s.add_dependency(%q<uuid>, [">= 0"])
    s.add_dependency(%q<wukong>, [">= 0"])
  end
end
|
data/scrape_from_file.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
$: << File.dirname(__FILE__)+'/lib'
|
4
|
+
require 'wukong'
|
5
|
+
require 'monkeyshines'
|
6
|
+
require 'monkeyshines/http_fetcher'
|
7
|
+
|
8
|
+
# The file of URLs to scrape is the first command-line argument; without it,
# print a usage hint on stderr and stop.
request_filename = ARGV.first
unless request_filename
  warn "Please give the name of a file holding URLs to scrape"
  exit
end
# Path handed to the output store further down in this script.
dump_filename = "/tmp/req_dump.tsv"
|
13
|
+
|
14
|
+
# Bare-bones scrape request record: the url to fetch, followed by slots for
# the scrape timestamp, the response code and message, and the contents.
class SimpleScrapeRequest < Struct.new(:url, :scraped_at, :response_code, :response_message, :contents)
end
|
19
|
+
|
20
|
+
class String
  # A string is its own flat form: returns the receiver itself.
  def to_flat() self end
end
|
25
|
+
|
26
|
+
# Script-local definition of Monkeyshines::FlatFileStore: opens +filename+
# for writing and appends one tab-separated line per stored item.
# NOTE(review): presumably this reopens a class the monkeyshines library
# already defines -- confirm the overrides are intentional.
class Monkeyshines::FlatFileStore
  attr_accessor :file, :filename

  # Opens +filename+ in "w" mode and keeps the handle in +file+.
  def initialize filename
    self.filename = filename
    self.file = File.open(filename, "w")
  end

  # Writes +contents+ as a single line: its to_flat fields joined by tabs.
  #
  # to_flat is evaluated once and coerced with Array(), so a bare String
  # (whose to_flat in this script returns the string itself) is written as a
  # one-field line instead of failing on String#join.
  # Returns whatever +file+ << returns.
  def << contents
    flat = contents.to_flat
    p flat   # debug echo of every stored record
    self.file << Array(flat).join("\t") + "\n"
  end
end
|
37
|
+
|
38
|
+
# Wire up the pieces -- fetcher, stream of requests read from the request
# file, and the dump file store -- then fetch each request in turn, echoing
# it and storing whatever the fetcher returns.
http     = Monkeyshines::HttpFetcher.new('twitter.com')
requests = Monkeyshines::FlatFileRequestStream.new(request_filename, SimpleScrapeRequest)
dump     = Monkeyshines::FlatFileStore.new(dump_filename)
requests.each do |scrape_request|
  p scrape_request
  response = http.get(scrape_request)
  dump << response
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: monkeyshines
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Philip (flip) Kromer
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-10-12 00:00:00 -05:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: addressable
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: uuid
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: wukong
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "0"
|
44
|
+
version:
|
45
|
+
description: A simple scraper for directed scrapes of APIs, feed or structured HTML. Plays nicely with wuclan and wukong.
|
46
|
+
email: flip@infochimps.org
|
47
|
+
executables: []
|
48
|
+
|
49
|
+
extensions: []
|
50
|
+
|
51
|
+
extra_rdoc_files:
|
52
|
+
- LICENSE
|
53
|
+
- LICENSE.textile
|
54
|
+
- README.textile
|
55
|
+
files:
|
56
|
+
- .document
|
57
|
+
- .gitignore
|
58
|
+
- LICENSE
|
59
|
+
- LICENSE.textile
|
60
|
+
- README.textile
|
61
|
+
- Rakefile
|
62
|
+
- VERSION
|
63
|
+
- examples/.gitignore
|
64
|
+
- examples/bulk_urls/scrape_bulk_urls.rb
|
65
|
+
- examples/rename_tree/rename_hdp_tree.rb
|
66
|
+
- examples/rename_tree/rename_ripd_tree.rb
|
67
|
+
- examples/rss_feeds/scrape_rss_feeds.rb
|
68
|
+
- examples/shorturls/README.textile
|
69
|
+
- examples/shorturls/bulkdump_shorturls.rb
|
70
|
+
- examples/shorturls/bulkload_shorturls.rb
|
71
|
+
- examples/shorturls/extract_urls.rb
|
72
|
+
- examples/shorturls/multiplex_shorturl_cache.rb
|
73
|
+
- examples/shorturls/old/multidump_and_fix_shorturls.rb
|
74
|
+
- examples/shorturls/old/shorturl_stats.rb
|
75
|
+
- examples/shorturls/scrape_shorturls.rb
|
76
|
+
- examples/shorturls/shorturl_request.rb
|
77
|
+
- examples/shorturls/shorturl_sequence.rb
|
78
|
+
- examples/shorturls/shorturl_start_tyrant.sh
|
79
|
+
- examples/shorturls/start_shorturl_cache.sh
|
80
|
+
- lib/monkeyshines.rb
|
81
|
+
- lib/monkeyshines/extensions.rb
|
82
|
+
- lib/monkeyshines/fetcher.rb
|
83
|
+
- lib/monkeyshines/fetcher/authed_http_fetcher.rb
|
84
|
+
- lib/monkeyshines/fetcher/base.rb
|
85
|
+
- lib/monkeyshines/fetcher/fake_fetcher.rb
|
86
|
+
- lib/monkeyshines/fetcher/http_fetcher.rb
|
87
|
+
- lib/monkeyshines/fetcher/http_head_fetcher.rb
|
88
|
+
- lib/monkeyshines/monitor.rb
|
89
|
+
- lib/monkeyshines/monitor/chunked_store.rb
|
90
|
+
- lib/monkeyshines/monitor/periodic_logger.rb
|
91
|
+
- lib/monkeyshines/monitor/periodic_monitor.rb
|
92
|
+
- lib/monkeyshines/options.rb
|
93
|
+
- lib/monkeyshines/recursive_runner.rb
|
94
|
+
- lib/monkeyshines/repository/base.rb
|
95
|
+
- lib/monkeyshines/repository/s3.rb
|
96
|
+
- lib/monkeyshines/request_stream.rb
|
97
|
+
- lib/monkeyshines/request_stream/base.rb
|
98
|
+
- lib/monkeyshines/request_stream/edamame_queue.rb
|
99
|
+
- lib/monkeyshines/request_stream/klass_request_stream.rb
|
100
|
+
- lib/monkeyshines/request_stream/simple_request_stream.rb
|
101
|
+
- lib/monkeyshines/runner.rb
|
102
|
+
- lib/monkeyshines/runner_core/options.rb
|
103
|
+
- lib/monkeyshines/runner_core/parsing_runner.rb
|
104
|
+
- lib/monkeyshines/scrape_job/old_paginated.rb
|
105
|
+
- lib/monkeyshines/scrape_job/recursive.rb
|
106
|
+
- lib/monkeyshines/scrape_request.rb
|
107
|
+
- lib/monkeyshines/scrape_request/paginated.rb
|
108
|
+
- lib/monkeyshines/scrape_request/raw_json_contents.rb
|
109
|
+
- lib/monkeyshines/scrape_request/signed_url.rb
|
110
|
+
- lib/monkeyshines/store.rb
|
111
|
+
- lib/monkeyshines/store/base.rb
|
112
|
+
- lib/monkeyshines/store/chunked_flat_file_store.rb
|
113
|
+
- lib/monkeyshines/store/conditional_store.rb
|
114
|
+
- lib/monkeyshines/store/factory.rb
|
115
|
+
- lib/monkeyshines/store/flat_file_store.rb
|
116
|
+
- lib/monkeyshines/store/key_store.rb
|
117
|
+
- lib/monkeyshines/store/null_store.rb
|
118
|
+
- lib/monkeyshines/store/read_thru_store.rb
|
119
|
+
- lib/monkeyshines/store/tokyo_tdb_key_store.rb
|
120
|
+
- lib/monkeyshines/store/tyrant_rdb_key_store.rb
|
121
|
+
- lib/monkeyshines/store/tyrant_tdb_key_store.rb
|
122
|
+
- lib/monkeyshines/utils/factory_module.rb
|
123
|
+
- lib/monkeyshines/utils/filename_pattern.rb
|
124
|
+
- lib/monkeyshines/utils/logger.rb
|
125
|
+
- lib/monkeyshines/utils/trollop-1.14/FAQ.txt
|
126
|
+
- lib/monkeyshines/utils/trollop-1.14/History.txt
|
127
|
+
- lib/monkeyshines/utils/trollop-1.14/Manifest.txt
|
128
|
+
- lib/monkeyshines/utils/trollop-1.14/README.txt
|
129
|
+
- lib/monkeyshines/utils/trollop-1.14/Rakefile
|
130
|
+
- lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb
|
131
|
+
- lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb
|
132
|
+
- lib/monkeyshines/utils/trollop.rb
|
133
|
+
- lib/monkeyshines/utils/union_interval.rb
|
134
|
+
- lib/monkeyshines/utils/uri.rb
|
135
|
+
- lib/monkeyshines/utils/uuid.rb
|
136
|
+
- monkeyshines.gemspec
|
137
|
+
- scrape_from_file.rb
|
138
|
+
- spec/monkeyshines_spec.rb
|
139
|
+
- spec/spec_helper.rb
|
140
|
+
has_rdoc: true
|
141
|
+
homepage: http://github.com/mrflip/monkeyshines
|
142
|
+
licenses: []
|
143
|
+
|
144
|
+
post_install_message:
|
145
|
+
rdoc_options:
|
146
|
+
- --charset=UTF-8
|
147
|
+
require_paths:
|
148
|
+
- lib
|
149
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
150
|
+
requirements:
|
151
|
+
- - ">="
|
152
|
+
- !ruby/object:Gem::Version
|
153
|
+
version: "0"
|
154
|
+
version:
|
155
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - ">="
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: "0"
|
160
|
+
version:
|
161
|
+
requirements: []
|
162
|
+
|
163
|
+
rubyforge_project:
|
164
|
+
rubygems_version: 1.3.5
|
165
|
+
signing_key:
|
166
|
+
specification_version: 3
|
167
|
+
summary: A simple scraper for directed scrapes of APIs, feed or structured HTML.
|
168
|
+
test_files:
|
169
|
+
- spec/monkeyshines_spec.rb
|
170
|
+
- spec/spec_helper.rb
|
171
|
+
- examples/bulk_urls/scrape_bulk_urls.rb
|
172
|
+
- examples/rename_tree/rename_hdp_tree.rb
|
173
|
+
- examples/rename_tree/rename_ripd_tree.rb
|
174
|
+
- examples/rss_feeds/scrape_rss_feeds.rb
|
175
|
+
- examples/shorturls/bulkdump_shorturls.rb
|
176
|
+
- examples/shorturls/bulkload_shorturls.rb
|
177
|
+
- examples/shorturls/extract_urls.rb
|
178
|
+
- examples/shorturls/multiplex_shorturl_cache.rb
|
179
|
+
- examples/shorturls/old/multidump_and_fix_shorturls.rb
|
180
|
+
- examples/shorturls/old/shorturl_stats.rb
|
181
|
+
- examples/shorturls/scrape_shorturls.rb
|
182
|
+
- examples/shorturls/shorturl_request.rb
|
183
|
+
- examples/shorturls/shorturl_sequence.rb
|