monkeyshines 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +4 -0
- data/.gitignore +43 -0
- data/LICENSE +20 -0
- data/LICENSE.textile +20 -0
- data/README.textile +125 -0
- data/Rakefile +105 -0
- data/VERSION +1 -0
- data/examples/.gitignore +4 -0
- data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
- data/examples/rename_tree/rename_hdp_tree.rb +151 -0
- data/examples/rename_tree/rename_ripd_tree.rb +82 -0
- data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
- data/examples/shorturls/README.textile +111 -0
- data/examples/shorturls/bulkdump_shorturls.rb +46 -0
- data/examples/shorturls/bulkload_shorturls.rb +45 -0
- data/examples/shorturls/extract_urls.rb +12 -0
- data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
- data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
- data/examples/shorturls/old/shorturl_stats.rb +81 -0
- data/examples/shorturls/scrape_shorturls.rb +112 -0
- data/examples/shorturls/shorturl_request.rb +29 -0
- data/examples/shorturls/shorturl_sequence.rb +121 -0
- data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
- data/examples/shorturls/start_shorturl_cache.sh +2 -0
- data/lib/monkeyshines.rb +31 -0
- data/lib/monkeyshines/extensions.rb +16 -0
- data/lib/monkeyshines/fetcher.rb +10 -0
- data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
- data/lib/monkeyshines/fetcher/base.rb +44 -0
- data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
- data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
- data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
- data/lib/monkeyshines/monitor.rb +7 -0
- data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
- data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
- data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
- data/lib/monkeyshines/options.rb +59 -0
- data/lib/monkeyshines/recursive_runner.rb +26 -0
- data/lib/monkeyshines/repository/base.rb +57 -0
- data/lib/monkeyshines/repository/s3.rb +169 -0
- data/lib/monkeyshines/request_stream.rb +11 -0
- data/lib/monkeyshines/request_stream/base.rb +32 -0
- data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
- data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
- data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
- data/lib/monkeyshines/runner.rb +161 -0
- data/lib/monkeyshines/runner_core/options.rb +5 -0
- data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
- data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
- data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
- data/lib/monkeyshines/scrape_request.rb +136 -0
- data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
- data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
- data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
- data/lib/monkeyshines/store.rb +14 -0
- data/lib/monkeyshines/store/base.rb +29 -0
- data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
- data/lib/monkeyshines/store/conditional_store.rb +57 -0
- data/lib/monkeyshines/store/factory.rb +8 -0
- data/lib/monkeyshines/store/flat_file_store.rb +84 -0
- data/lib/monkeyshines/store/key_store.rb +51 -0
- data/lib/monkeyshines/store/null_store.rb +15 -0
- data/lib/monkeyshines/store/read_thru_store.rb +22 -0
- data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
- data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
- data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
- data/lib/monkeyshines/utils/factory_module.rb +106 -0
- data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
- data/lib/monkeyshines/utils/logger.rb +15 -0
- data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
- data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
- data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
- data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
- data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
- data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
- data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
- data/lib/monkeyshines/utils/trollop.rb +744 -0
- data/lib/monkeyshines/utils/union_interval.rb +52 -0
- data/lib/monkeyshines/utils/uri.rb +70 -0
- data/lib/monkeyshines/utils/uuid.rb +32 -0
- data/monkeyshines.gemspec +147 -0
- data/scrape_from_file.rb +44 -0
- data/spec/monkeyshines_spec.rb +7 -0
- data/spec/spec_helper.rb +9 -0
- metadata +183 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
#
|
2
|
+
# A numeric interval
|
3
|
+
#
|
4
|
+
# --
|
5
|
+
# could be done with a Range but proved annoying in practice
|
6
|
+
# what with Range's immutability, etc.
|
7
|
+
# ++
|
8
|
+
#
|
9
|
+
class UnionInterval
  # Lower and upper bound of the interval. A nil bound is treated as "no bound
  # on that side" by #include?, and as "not yet set" by #<< and #+.
  attr_accessor :min, :max

  # Initialize with optional min and max values.
  #
  # To create an interval with no lower bound call:
  #   UnionInterval.new(nil, 69)
  # Pass nil (or omit) +max+ for no upper bound:
  #   UnionInterval.new(5, nil)
  def initialize min=nil, max=nil
    self.min = min
    self.max = max
  end

  # Expand the interval, in place, to include all the vals.
  #
  # +vals+ may be a single value, nil, a (possibly nested) array, a Range, or
  # anything else that responds to #to_a, such as another UnionInterval.
  #
  # Returns the new +max+ (the value of the last assignment).
  def << vals
    lo, hi = widened_bounds(vals)
    self.min = lo
    self.max = hi
  end

  # Non-destructive version of #<<: returns a new UnionInterval covering both
  # this interval and +min_max+, leaving the receiver untouched.
  def + min_max
    sum_min, sum_max = widened_bounds(min_max)
    UnionInterval.new sum_min, sum_max
  end

  # returns span as an array:
  #   [min, max]
  def to_a
    [min, max]
  end

  # truthy if the extent is defined but empty (lower bound exceeds upper bound);
  # nil when either bound is unset.
  def empty?
    min && max && (min > max)
  end

  # truthy if +val+ is non-nil and lies within the bounds (inclusive on both
  # ends). A nil bound places no restriction on that side.
  def include? val
    val && (!min || (val >= min)) && (!max || (val <= max))
  end

  # max - min, or 0 when either bound is unset.
  def size
    return 0 unless max && min
    max - min
  end

  # string conversion:
  #   #<span:7..956734>
  def to_s
    "#<span:#{min}..#{max}>"
  end
  def inspect() to_s end

  private

  # The [min, max] pair this interval would have after absorbing +vals+.
  #
  # Kernel#Array is used instead of vals.to_a because plain objects such as
  # Integers no longer respond to #to_a on Ruby 1.9 and later; Array() wraps
  # them in a one-element array and still defers to #to_a when it exists.
  def widened_bounds vals
    extras = Array(vals)
    [ [min, extras].flatten.compact.min,
      [max, extras].flatten.compact.max ]
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'addressable/uri'
|
2
|
+
module Addressable
  #
  # Add the .scrub_url, #revhost, #md5hash and #url_uuid calls
  #
  class URI
    #
    # These are illegal but *are* found in URLs. We're going to let them through.
    # Note that ' ' space is one of the tolerated miscreants.
    #
    URL_ILLEGAL_BUT_WHATEVER_DOOD_CHARS = '\{\}\| \^\`'
    #
    # These are all the characters that belong in a URL
    #
    PERMISSIVE_SCRUB_CHARS =
      URL_ILLEGAL_BUT_WHATEVER_DOOD_CHARS +
      Addressable::URI::CharacterClasses::UNRESERVED +
      Addressable::URI::CharacterClasses::RESERVED + '%'
    #
    # Matches any run of characters *not* in PERMISSIVE_SCRUB_CHARS. Built once
    # here so that scrub_url doesn't re-interpolate the pattern on every call.
    #
    PERMISSIVE_SCRUB_RE = /[^#{PERMISSIVE_SCRUB_CHARS}]+/

    #
    # Replace all url-insane characters by their %encoding.  We don't really
    # care here whether the URLs do anything: we just want to remove stuff that
    # absosmurfly don't belong.
    #
    # This code is stolen from Addressable::URI, which unfortunately has a bug
    # in exactly this method (fixed here). (http://addressable.rubyforge.org)
    # Note that we are /not/ re-encoding characters like '%' -- it's assumed
    # that the url is encoded, but perhaps poorly.
    #
    # In practice the illegal characters most often seen are those in
    # URL_ILLEGAL_BUT_WHATEVER_DOOD_CHARS plus
    #   <>"\t\\
    #
    # Returns +url+ itself, untouched, when it is blank; otherwise a new string
    # in which each offending byte is replaced by an upper-case %XX escape.
    #
    # NOTE(review): #blank? is not core Ruby -- presumably supplied by an
    # extension library loaded elsewhere; confirm it is always in scope here.
    #
    def self.scrub_url url
      return url if url.blank?
      url.gsub(PERMISSIVE_SCRUB_RE) do |sequence|
        # escape byte-by-byte so multi-byte characters become several %XX groups
        sequence.unpack('C*').map{ |byte| "%%%02X" % byte }.join("")
      end
    end

    #
    # +revhost+
    # the dot-reversed host:
    #   foo.company.com => com.company.foo
    #
    # A host with no dots in it (or no host at all) is returned as-is.
    #
    def revhost
      return host unless host =~ /\./
      host.split('.').reverse.join('.')
    end

    #
    # The md5hash of this URI: hex MD5 digest of the normalized URI string.
    #
    # make sure to require 'digest/md5' somewhere...
    #
    def md5hash
      Digest::MD5.hexdigest(self.normalize.to_s)
    end

    #
    # +url_uuid+ -- RFC-4122 ver.5 (name-based, SHA-1) uuid built from the
    # normalized URI string within the URL namespace.
    #
    # See http://www.faqs.org/rfcs/rfc4122.html
    #
    # You need to require "monkeyshines/utils/uuid" as well...
    #
    # NOTE(review): UUID.sha1_create and UUID_URL_NAMESPACE are not defined in
    # this file -- presumably they come from uuidtools; confirm against the
    # version in use.
    #
    def url_uuid
      UUID.sha1_create(UUID_URL_NAMESPACE, self.normalize.to_s)
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'uuidtools'
|
2
|
+
class UUID

  #
  # A string suitable for using as a path name --
  #
  # Ex.
  #    urn:uuid:3c0dce44-80a8-11dd-a897-001ff35a0a8b =>
  #    urn_uuid/3c0dce44/80a8/11dd/a897/001ff35a0a8b
  #
  # Every ':' and '-' in #to_s becomes a '/', and the result is prefixed with
  # 'urn_uuid/'.
  #
  # It's well possible there are more perspicacious choices for points to split
  # the string, but until we hit that limit this'll do.
  #
  def to_path
    'urn_uuid/' + to_s.tr(':-', '/')
  end

  #
  # Insert the dashes into a bare run of 32 lower-case hex digits:
  #    3c0dce4480a811dda897001ff35a0a8b =>
  #    3c0dce44-80a8-11dd-a897-001ff35a0a8b
  #
  # Raises ArgumentError if +str+ does not contain 32 consecutive lower-case
  # hex digits (rather than failing on a nil match).
  #
  def self.hex_to_str str
    groups = /([\da-f]{8})([\da-f]{4})([\da-f]{4})([\da-f]{4})([\da-f]{12})/.match(str)
    raise ArgumentError, "not a 32-digit hex UUID: #{str.inspect}" unless groups
    groups.captures.join '-'
  end

  #
  # Parse a UUID from its undashed 32-hex-digit form, by dashing it with
  # hex_to_str and handing the result to +parse+ (defined elsewhere).
  #
  def self.parse_hex str
    parse(UUID.hex_to_str(str))
  end

  # Overrides UUIDTools -- force 32 hex digits (leading zeros)
  def hexdigest
    "%032x" % self.to_i
  end

end
|
@@ -0,0 +1,147 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# NOTE(review): this file is generated by jeweler; any hand edit here is lost
# on the next `rake gemspec`, so a lasting fix belongs in the Rakefile too.
Gem::Specification.new do |s|
  s.name = %q{monkeyshines}
  s.version = "0.0.2"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Philip (flip) Kromer"]
  s.date = %q{2009-10-12}
  s.description = %q{A simple scraper for directed scrapes of APIs, feed or structured HTML. Plays nicely with wuclan and wukong.}
  s.email = %q{flip@infochimps.org}
  s.extra_rdoc_files = [
    "LICENSE",
    "LICENSE.textile",
    "README.textile"
  ]
  s.files = %w[
    .document
    .gitignore
    LICENSE
    LICENSE.textile
    README.textile
    Rakefile
    VERSION
    examples/.gitignore
    examples/bulk_urls/scrape_bulk_urls.rb
    examples/rename_tree/rename_hdp_tree.rb
    examples/rename_tree/rename_ripd_tree.rb
    examples/rss_feeds/scrape_rss_feeds.rb
    examples/shorturls/README.textile
    examples/shorturls/bulkdump_shorturls.rb
    examples/shorturls/bulkload_shorturls.rb
    examples/shorturls/extract_urls.rb
    examples/shorturls/multiplex_shorturl_cache.rb
    examples/shorturls/old/multidump_and_fix_shorturls.rb
    examples/shorturls/old/shorturl_stats.rb
    examples/shorturls/scrape_shorturls.rb
    examples/shorturls/shorturl_request.rb
    examples/shorturls/shorturl_sequence.rb
    examples/shorturls/shorturl_start_tyrant.sh
    examples/shorturls/start_shorturl_cache.sh
    lib/monkeyshines.rb
    lib/monkeyshines/extensions.rb
    lib/monkeyshines/fetcher.rb
    lib/monkeyshines/fetcher/authed_http_fetcher.rb
    lib/monkeyshines/fetcher/base.rb
    lib/monkeyshines/fetcher/fake_fetcher.rb
    lib/monkeyshines/fetcher/http_fetcher.rb
    lib/monkeyshines/fetcher/http_head_fetcher.rb
    lib/monkeyshines/monitor.rb
    lib/monkeyshines/monitor/chunked_store.rb
    lib/monkeyshines/monitor/periodic_logger.rb
    lib/monkeyshines/monitor/periodic_monitor.rb
    lib/monkeyshines/options.rb
    lib/monkeyshines/recursive_runner.rb
    lib/monkeyshines/repository/base.rb
    lib/monkeyshines/repository/s3.rb
    lib/monkeyshines/request_stream.rb
    lib/monkeyshines/request_stream/base.rb
    lib/monkeyshines/request_stream/edamame_queue.rb
    lib/monkeyshines/request_stream/klass_request_stream.rb
    lib/monkeyshines/request_stream/simple_request_stream.rb
    lib/monkeyshines/runner.rb
    lib/monkeyshines/runner_core/options.rb
    lib/monkeyshines/runner_core/parsing_runner.rb
    lib/monkeyshines/scrape_job/old_paginated.rb
    lib/monkeyshines/scrape_job/recursive.rb
    lib/monkeyshines/scrape_request.rb
    lib/monkeyshines/scrape_request/paginated.rb
    lib/monkeyshines/scrape_request/raw_json_contents.rb
    lib/monkeyshines/scrape_request/signed_url.rb
    lib/monkeyshines/store.rb
    lib/monkeyshines/store/base.rb
    lib/monkeyshines/store/chunked_flat_file_store.rb
    lib/monkeyshines/store/conditional_store.rb
    lib/monkeyshines/store/factory.rb
    lib/monkeyshines/store/flat_file_store.rb
    lib/monkeyshines/store/key_store.rb
    lib/monkeyshines/store/null_store.rb
    lib/monkeyshines/store/read_thru_store.rb
    lib/monkeyshines/store/tokyo_tdb_key_store.rb
    lib/monkeyshines/store/tyrant_rdb_key_store.rb
    lib/monkeyshines/store/tyrant_tdb_key_store.rb
    lib/monkeyshines/utils/factory_module.rb
    lib/monkeyshines/utils/filename_pattern.rb
    lib/monkeyshines/utils/logger.rb
    lib/monkeyshines/utils/trollop-1.14/FAQ.txt
    lib/monkeyshines/utils/trollop-1.14/History.txt
    lib/monkeyshines/utils/trollop-1.14/Manifest.txt
    lib/monkeyshines/utils/trollop-1.14/README.txt
    lib/monkeyshines/utils/trollop-1.14/Rakefile
    lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb
    lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb
    lib/monkeyshines/utils/trollop.rb
    lib/monkeyshines/utils/union_interval.rb
    lib/monkeyshines/utils/uri.rb
    lib/monkeyshines/utils/uuid.rb
    monkeyshines.gemspec
    scrape_from_file.rb
    spec/monkeyshines_spec.rb
    spec/spec_helper.rb
  ]
  s.homepage = %q{http://github.com/mrflip/monkeyshines}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{A simple scraper for directed scrapes of APIs, feed or structured HTML.}
  s.test_files = %w[
    spec/monkeyshines_spec.rb
    spec/spec_helper.rb
    examples/bulk_urls/scrape_bulk_urls.rb
    examples/rename_tree/rename_hdp_tree.rb
    examples/rename_tree/rename_ripd_tree.rb
    examples/rss_feeds/scrape_rss_feeds.rb
    examples/shorturls/bulkdump_shorturls.rb
    examples/shorturls/bulkload_shorturls.rb
    examples/shorturls/extract_urls.rb
    examples/shorturls/multiplex_shorturl_cache.rb
    examples/shorturls/old/multidump_and_fix_shorturls.rb
    examples/shorturls/old/shorturl_stats.rb
    examples/shorturls/scrape_shorturls.rb
    examples/shorturls/shorturl_request.rb
    examples/shorturls/shorturl_sequence.rb
  ]

  s.specification_version = 3 if s.respond_to? :specification_version=

  # Declare runtime dependencies where this RubyGems can express them, and plain
  # dependencies otherwise. This is decided by asking the specification itself
  # rather than by comparing Gem::RubyGemsVersion, a constant that current
  # RubyGems releases no longer define.
  declare = s.respond_to?(:add_runtime_dependency) ? :add_runtime_dependency : :add_dependency
  %w[addressable uuid wukong].each do |gem_name|
    s.send(declare, gem_name, [">= 0"])
  end
end
|
data/scrape_from_file.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
#!/usr/bin/env ruby
#
# Fetch every request read from the file named on the command line and append
# each result, as one tab-separated row, to /tmp/req_dump.tsv.
#
#   scrape_from_file.rb REQUEST_FILE
#
require 'rubygems'
$: << File.dirname(__FILE__)+'/lib'
require 'wukong'
require 'monkeyshines'
# This package ships the fetcher as lib/monkeyshines/fetcher/http_fetcher.rb;
# it has no lib/monkeyshines/http_fetcher.rb to load.
require 'monkeyshines/fetcher/http_fetcher'

request_filename = ARGV[0]
# A missing argument is a usage error: say so on stderr and exit non-zero.
abort "Please give the name of a file holding URLs to scrape" unless request_filename
dump_filename = "/tmp/req_dump.tsv"

# One scrape: the url requested plus the fields recorded about the response.
class SimpleScrapeRequest < Struct.new(
    :url,
    :scraped_at, :response_code, :response_message,
    :contents )
end

class String
  # A string is already flat: return it unchanged.
  def to_flat
    self
  end
end

# NOTE(review): this reopens Monkeyshines::FlatFileStore and replaces its
# #initialize and #<< -- confirm that overriding the library's own versions is
# intended.
class Monkeyshines::FlatFileStore
  attr_accessor :file, :filename
  # Opens +filename+ for writing, truncating any existing contents.
  def initialize filename
    self.filename = filename
    self.file = File.open(filename, "w")
  end
  # Echoes the flattened record to stdout, then writes it as a tab-separated
  # line. +contents+ must flatten (via #to_flat) to something with #join.
  def << contents
    p contents.to_flat
    self.file << contents.to_flat.join("\t") + "\n"
  end
end

fetcher = Monkeyshines::HttpFetcher.new('twitter.com')
# NOTE(review): nothing in this package's file list is obviously named for
# FlatFileRequestStream -- confirm the class is still defined.
reqs    = Monkeyshines::FlatFileRequestStream.new(request_filename, SimpleScrapeRequest)
store   = Monkeyshines::FlatFileStore.new(dump_filename)
begin
  reqs.each do |scrape_request|
    p scrape_request
    store << fetcher.get(scrape_request)
  end
ensure
  # close (and so flush) the dump file even if a fetch blows up midway
  store.file.close
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,183 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: monkeyshines
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Philip (flip) Kromer
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-10-12 00:00:00 -05:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: addressable
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: uuid
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: "0"
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: wukong
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: "0"
|
44
|
+
version:
|
45
|
+
description: A simple scraper for directed scrapes of APIs, feed or structured HTML. Plays nicely with wuclan and wukong.
|
46
|
+
email: flip@infochimps.org
|
47
|
+
executables: []
|
48
|
+
|
49
|
+
extensions: []
|
50
|
+
|
51
|
+
extra_rdoc_files:
|
52
|
+
- LICENSE
|
53
|
+
- LICENSE.textile
|
54
|
+
- README.textile
|
55
|
+
files:
|
56
|
+
- .document
|
57
|
+
- .gitignore
|
58
|
+
- LICENSE
|
59
|
+
- LICENSE.textile
|
60
|
+
- README.textile
|
61
|
+
- Rakefile
|
62
|
+
- VERSION
|
63
|
+
- examples/.gitignore
|
64
|
+
- examples/bulk_urls/scrape_bulk_urls.rb
|
65
|
+
- examples/rename_tree/rename_hdp_tree.rb
|
66
|
+
- examples/rename_tree/rename_ripd_tree.rb
|
67
|
+
- examples/rss_feeds/scrape_rss_feeds.rb
|
68
|
+
- examples/shorturls/README.textile
|
69
|
+
- examples/shorturls/bulkdump_shorturls.rb
|
70
|
+
- examples/shorturls/bulkload_shorturls.rb
|
71
|
+
- examples/shorturls/extract_urls.rb
|
72
|
+
- examples/shorturls/multiplex_shorturl_cache.rb
|
73
|
+
- examples/shorturls/old/multidump_and_fix_shorturls.rb
|
74
|
+
- examples/shorturls/old/shorturl_stats.rb
|
75
|
+
- examples/shorturls/scrape_shorturls.rb
|
76
|
+
- examples/shorturls/shorturl_request.rb
|
77
|
+
- examples/shorturls/shorturl_sequence.rb
|
78
|
+
- examples/shorturls/shorturl_start_tyrant.sh
|
79
|
+
- examples/shorturls/start_shorturl_cache.sh
|
80
|
+
- lib/monkeyshines.rb
|
81
|
+
- lib/monkeyshines/extensions.rb
|
82
|
+
- lib/monkeyshines/fetcher.rb
|
83
|
+
- lib/monkeyshines/fetcher/authed_http_fetcher.rb
|
84
|
+
- lib/monkeyshines/fetcher/base.rb
|
85
|
+
- lib/monkeyshines/fetcher/fake_fetcher.rb
|
86
|
+
- lib/monkeyshines/fetcher/http_fetcher.rb
|
87
|
+
- lib/monkeyshines/fetcher/http_head_fetcher.rb
|
88
|
+
- lib/monkeyshines/monitor.rb
|
89
|
+
- lib/monkeyshines/monitor/chunked_store.rb
|
90
|
+
- lib/monkeyshines/monitor/periodic_logger.rb
|
91
|
+
- lib/monkeyshines/monitor/periodic_monitor.rb
|
92
|
+
- lib/monkeyshines/options.rb
|
93
|
+
- lib/monkeyshines/recursive_runner.rb
|
94
|
+
- lib/monkeyshines/repository/base.rb
|
95
|
+
- lib/monkeyshines/repository/s3.rb
|
96
|
+
- lib/monkeyshines/request_stream.rb
|
97
|
+
- lib/monkeyshines/request_stream/base.rb
|
98
|
+
- lib/monkeyshines/request_stream/edamame_queue.rb
|
99
|
+
- lib/monkeyshines/request_stream/klass_request_stream.rb
|
100
|
+
- lib/monkeyshines/request_stream/simple_request_stream.rb
|
101
|
+
- lib/monkeyshines/runner.rb
|
102
|
+
- lib/monkeyshines/runner_core/options.rb
|
103
|
+
- lib/monkeyshines/runner_core/parsing_runner.rb
|
104
|
+
- lib/monkeyshines/scrape_job/old_paginated.rb
|
105
|
+
- lib/monkeyshines/scrape_job/recursive.rb
|
106
|
+
- lib/monkeyshines/scrape_request.rb
|
107
|
+
- lib/monkeyshines/scrape_request/paginated.rb
|
108
|
+
- lib/monkeyshines/scrape_request/raw_json_contents.rb
|
109
|
+
- lib/monkeyshines/scrape_request/signed_url.rb
|
110
|
+
- lib/monkeyshines/store.rb
|
111
|
+
- lib/monkeyshines/store/base.rb
|
112
|
+
- lib/monkeyshines/store/chunked_flat_file_store.rb
|
113
|
+
- lib/monkeyshines/store/conditional_store.rb
|
114
|
+
- lib/monkeyshines/store/factory.rb
|
115
|
+
- lib/monkeyshines/store/flat_file_store.rb
|
116
|
+
- lib/monkeyshines/store/key_store.rb
|
117
|
+
- lib/monkeyshines/store/null_store.rb
|
118
|
+
- lib/monkeyshines/store/read_thru_store.rb
|
119
|
+
- lib/monkeyshines/store/tokyo_tdb_key_store.rb
|
120
|
+
- lib/monkeyshines/store/tyrant_rdb_key_store.rb
|
121
|
+
- lib/monkeyshines/store/tyrant_tdb_key_store.rb
|
122
|
+
- lib/monkeyshines/utils/factory_module.rb
|
123
|
+
- lib/monkeyshines/utils/filename_pattern.rb
|
124
|
+
- lib/monkeyshines/utils/logger.rb
|
125
|
+
- lib/monkeyshines/utils/trollop-1.14/FAQ.txt
|
126
|
+
- lib/monkeyshines/utils/trollop-1.14/History.txt
|
127
|
+
- lib/monkeyshines/utils/trollop-1.14/Manifest.txt
|
128
|
+
- lib/monkeyshines/utils/trollop-1.14/README.txt
|
129
|
+
- lib/monkeyshines/utils/trollop-1.14/Rakefile
|
130
|
+
- lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb
|
131
|
+
- lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb
|
132
|
+
- lib/monkeyshines/utils/trollop.rb
|
133
|
+
- lib/monkeyshines/utils/union_interval.rb
|
134
|
+
- lib/monkeyshines/utils/uri.rb
|
135
|
+
- lib/monkeyshines/utils/uuid.rb
|
136
|
+
- monkeyshines.gemspec
|
137
|
+
- scrape_from_file.rb
|
138
|
+
- spec/monkeyshines_spec.rb
|
139
|
+
- spec/spec_helper.rb
|
140
|
+
has_rdoc: true
|
141
|
+
homepage: http://github.com/mrflip/monkeyshines
|
142
|
+
licenses: []
|
143
|
+
|
144
|
+
post_install_message:
|
145
|
+
rdoc_options:
|
146
|
+
- --charset=UTF-8
|
147
|
+
require_paths:
|
148
|
+
- lib
|
149
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
150
|
+
requirements:
|
151
|
+
- - ">="
|
152
|
+
- !ruby/object:Gem::Version
|
153
|
+
version: "0"
|
154
|
+
version:
|
155
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - ">="
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: "0"
|
160
|
+
version:
|
161
|
+
requirements: []
|
162
|
+
|
163
|
+
rubyforge_project:
|
164
|
+
rubygems_version: 1.3.5
|
165
|
+
signing_key:
|
166
|
+
specification_version: 3
|
167
|
+
summary: A simple scraper for directed scrapes of APIs, feed or structured HTML.
|
168
|
+
test_files:
|
169
|
+
- spec/monkeyshines_spec.rb
|
170
|
+
- spec/spec_helper.rb
|
171
|
+
- examples/bulk_urls/scrape_bulk_urls.rb
|
172
|
+
- examples/rename_tree/rename_hdp_tree.rb
|
173
|
+
- examples/rename_tree/rename_ripd_tree.rb
|
174
|
+
- examples/rss_feeds/scrape_rss_feeds.rb
|
175
|
+
- examples/shorturls/bulkdump_shorturls.rb
|
176
|
+
- examples/shorturls/bulkload_shorturls.rb
|
177
|
+
- examples/shorturls/extract_urls.rb
|
178
|
+
- examples/shorturls/multiplex_shorturl_cache.rb
|
179
|
+
- examples/shorturls/old/multidump_and_fix_shorturls.rb
|
180
|
+
- examples/shorturls/old/shorturl_stats.rb
|
181
|
+
- examples/shorturls/scrape_shorturls.rb
|
182
|
+
- examples/shorturls/shorturl_request.rb
|
183
|
+
- examples/shorturls/shorturl_sequence.rb
|