fizx-csvget 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +45 -13
- data/Rakefile +1 -0
- data/VERSION +1 -1
- data/bin/jsonget +28 -0
- data/csvget.gemspec +63 -0
- data/lib/jsonget.rb +28 -0
- metadata +6 -2
data/README.rdoc
CHANGED
@@ -25,19 +25,51 @@
|
|
25
25
|
}]
|
26
26
|
}
|
27
27
|
> csvget --directory-prefix=./data -A "/x" -w 5 --parselet=hn.let http://news.ycombinator.com/
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
28
|
+
> head data/headlines.csv
|
29
|
+
comments,title,time,link,score,user
|
30
|
+
4,Simpson's paradox: why mistrust seemingly simple statistics,2 hours ago,http://en.wikipedia.org/wiki/Simpson%27s_paradox,41,waldrews
|
31
|
+
67,America's unjust sex laws,2 hours ago,http://www.economist.com/opinion/displaystory.cfm?story_id=14165460,59,MikeCapone
|
32
|
+
23,Buy somebody lunch,3 hours ago,http://www.whattofix.com/blog/archives/2009/08/buy-somebody-lu.php,58,DanielBMarkham
|
33
|
+
10,A design pattern is an artifact of a missing feature in your chosen language,3 hours ago,http://www.snell-pym.org.uk/archives/2008/12/29/design-patterns/,31,bensummers
|
34
|
+
4,API changes in Snow Leopard,1 hour ago,http://developer.apple.com/mac/library/releasenotes/MacOSX/WhatsNewInOSX/Articles/MacOSX10_6.html#//apple_ref/doc/uid/TP40008898-SW1,14,pieter
|
35
|
+
16,How to run a linux based home web server,3 hours ago,http://stevehanov.ca/blog/index.php?id=73,28,RiderOfGiraffes
|
36
|
+
1,"OpenCL ""Hello World""",1 hour ago,"http://developer.apple.com/mac/library/documentation/Performance/Conceptual/OpenCL_MacProgGuide/Example:Hello,World/Example:Hello,World.html",8,pieter
|
37
|
+
15,US Senate bill allows White House to disconnect private computers from Internet,4 hours ago,http://news.cnet.com/8301-13578_3-10320096-38.html,35,drewr
|
38
|
+
1,Strategy: Solve Only 80 Percent of the Problem,47 minutes ago,http://highscalability.com/strategy-solve-only-80-percent-problem,6,alrex021
|
39
|
+
> csvget -h
|
40
|
+
Usage: ./bin/csvget [options] SEED_URL [SEED_URL2 ...]
|
41
|
+
--parselet=JSON_FILE JSON_FILE is a parselet.
|
42
|
+
-w, --wait=SECONDS wait SECONDS between retrievals.
|
43
|
+
-P, --directory-prefix=PREFIX save files to PREFIX/...
|
44
|
+
-U, --user-agent=AGENT identify as AGENT instead of RWget/VERSION.
|
45
|
+
-A, --accept-pattern=RUBY_REGEX URLs must match RUBY_REGEX to be saved to the queue.
|
46
|
+
--time-limit=AMOUNT Crawler will stop after this AMOUNT of time has passed.
|
47
|
+
-R, --reject-pattern=RUBY_REGEX URLs must NOT match RUBY_REGEX to be saved to the queue.
|
48
|
+
--require=RUBY_SCRIPT Will execute 'require RUBY_SCRIPT'
|
49
|
+
--limit-rate=RATE limit download rate to RATE.
|
50
|
+
--http-proxy=URL Proxies via URL
|
51
|
+
--proxy-user=USER Sets proxy user to USER
|
52
|
+
--proxy-password=PASSWORD Sets proxy password to PASSWORD
|
53
|
+
--fetch-class=RUBY_CLASS Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]
|
54
|
+
--store-class=RUBY_CLASS Must implement put(key_string, temp_file)
|
55
|
+
--dupes-class=RUBY_CLASS Must implement dupe?(uri)
|
56
|
+
--queue-class=RUBY_CLASS Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]
|
57
|
+
--links-class=RUBY_CLASS Must implement urls(base_uri, temp_file) #=> [uri, ...]
|
58
|
+
-S, --sitemap=URL URL of a sitemap to crawl (will ignore inter-page links)
|
59
|
+
-V, --version
|
60
|
+
-Q, --quota=NUMBER set retrieval quota to NUMBER.
|
61
|
+
--max-redirect=NUM maximum redirections allowed per page.
|
62
|
+
-H, --span-hosts go to foreign hosts when recursive
|
63
|
+
--connect-timeout=SECS set the connect timeout to SECS.
|
64
|
+
-T, --timeout=SECS set all timeout values to SECONDS.
|
65
|
+
-l, --level=NUMBER maximum recursion depth (inf or 0 for infinite).
|
66
|
+
--[no-]timestampize Prepend the timestamp of when the crawl started to the directory structure.
|
67
|
+
--incremental-from=PREVIOUS Build upon the indexing already saved in PREVIOUS.
|
68
|
+
--protocol-directories use protocol name in directories.
|
69
|
+
--no-host-directories don't create host directories.
|
70
|
+
-v, --[no-]verbose Run verbosely
|
71
|
+
-h, --help Show this message
|
72
|
+
|
41
73
|
== Copyright
|
42
74
|
|
43
75
|
Copyright (c) 2009 Kyle Maxwell. See LICENSE for details (MIT).
|
data/Rakefile
CHANGED
@@ -5,6 +5,7 @@ begin
|
|
5
5
|
require 'jeweler'
|
6
6
|
Jeweler::Tasks.new do |gem|
|
7
7
|
gem.name = "csvget"
|
8
|
+
gem.executables = ["csvget", "jsonget"]
|
8
9
|
gem.summary = %Q{Uses parselets and rwget to generate csv files from websites}
|
9
10
|
gem.description = %Q{Super easy to use (but lots of dependencies :/) parser}
|
10
11
|
gem.email = "kyle@kylemaxwell.com"
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.2.0
|
data/bin/jsonget
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require "rubygems"
|
3
|
+
require File.dirname(__FILE__) + "/../lib/jsonget"
|
4
|
+
require "rwget"
|
5
|
+
|
6
|
+
parser = RWGetOptionParser.new do |opts|
|
7
|
+
opts.on("--parselet=JSON_FILE", "JSON_FILE is a parselet.") do |path|
|
8
|
+
parser.options[:parselets] ||= []
|
9
|
+
parser.options[:parselets] << path
|
10
|
+
end
|
11
|
+
end
|
12
|
+
parser.parse!
|
13
|
+
|
14
|
+
if parser.options[:seeds].empty?
|
15
|
+
puts parser.usage
|
16
|
+
puts " -h for options listing"
|
17
|
+
exit(1)
|
18
|
+
end
|
19
|
+
|
20
|
+
parser.options[:store_class] ||= "JSONStore"
|
21
|
+
|
22
|
+
controller = RWGet::Controller.new(parser.options)
|
23
|
+
begin
|
24
|
+
controller.start
|
25
|
+
ensure
|
26
|
+
STDERR.puts "Closing..."
|
27
|
+
controller.close
|
28
|
+
end
|
data/csvget.gemspec
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{csvget}
|
8
|
+
s.version = "0.2.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Kyle Maxwell"]
|
12
|
+
s.date = %q{2009-08-28}
|
13
|
+
s.description = %q{Super easy to use (but lots of dependencies :/) parser}
|
14
|
+
s.email = %q{kyle@kylemaxwell.com}
|
15
|
+
s.executables = ["csvget", "jsonget"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"LICENSE",
|
18
|
+
"README.rdoc"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
".gitignore",
|
22
|
+
"LICENSE",
|
23
|
+
"README.rdoc",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION",
|
26
|
+
"bin/csvget",
|
27
|
+
"bin/jsonget",
|
28
|
+
"csvget.gemspec",
|
29
|
+
"lib/csvget.rb",
|
30
|
+
"lib/jsonget.rb",
|
31
|
+
"test/csvget_test.rb"
|
32
|
+
]
|
33
|
+
s.homepage = %q{http://github.com/fizx/csvget}
|
34
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
35
|
+
s.require_paths = ["lib"]
|
36
|
+
s.rubygems_version = %q{1.3.5}
|
37
|
+
s.summary = %q{Uses parselets and rwget to generate csv files from websites}
|
38
|
+
s.test_files = [
|
39
|
+
"test/csvget_test.rb"
|
40
|
+
]
|
41
|
+
|
42
|
+
if s.respond_to? :specification_version then
|
43
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
44
|
+
s.specification_version = 3
|
45
|
+
|
46
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
47
|
+
s.add_runtime_dependency(%q<fizx-rwget>, ["> 0.2.3"])
|
48
|
+
s.add_runtime_dependency(%q<fizx-parsley-ruby>, ["> 0.0.0"])
|
49
|
+
s.add_runtime_dependency(%q<activesupport>, ["> 0.0.0"])
|
50
|
+
s.add_runtime_dependency(%q<fastercsv>, [">= 1.4.0"])
|
51
|
+
else
|
52
|
+
s.add_dependency(%q<fizx-rwget>, ["> 0.2.3"])
|
53
|
+
s.add_dependency(%q<fizx-parsley-ruby>, ["> 0.0.0"])
|
54
|
+
s.add_dependency(%q<activesupport>, ["> 0.0.0"])
|
55
|
+
s.add_dependency(%q<fastercsv>, [">= 1.4.0"])
|
56
|
+
end
|
57
|
+
else
|
58
|
+
s.add_dependency(%q<fizx-rwget>, ["> 0.2.3"])
|
59
|
+
s.add_dependency(%q<fizx-parsley-ruby>, ["> 0.0.0"])
|
60
|
+
s.add_dependency(%q<activesupport>, ["> 0.0.0"])
|
61
|
+
s.add_dependency(%q<fastercsv>, [">= 1.4.0"])
|
62
|
+
end
|
63
|
+
end
|
data/lib/jsonget.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "rwget"
|
3
|
+
require "parsley"
|
4
|
+
require "activesupport"
|
5
|
+
require "fileutils"
|
6
|
+
|
7
|
+
class JSONStore
|
8
|
+
def initialize(options = {})
|
9
|
+
@output_folder = options[:prefix] || "."
|
10
|
+
FileUtils.mkdir_p(@output_folder)
|
11
|
+
@parselets = (options[:parselets] || []).map{|path| Parsley.new(File.read(path)) }
|
12
|
+
@files = (options[:parselets] || []).map{|path| File.open("#{File.basename(path)}.json", "a") }
|
13
|
+
end
|
14
|
+
|
15
|
+
def put(host, tmpfile)
|
16
|
+
@parselets.zip(@files).each do |parselet, file|
|
17
|
+
begin
|
18
|
+
file.puts parselet.parse(:file => tmpfile.path, :output => :json) + ","
|
19
|
+
rescue ParsleyError => e
|
20
|
+
STDERR.puts "warning: #{e.message}"
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def close
|
26
|
+
@files.map(&:close)
|
27
|
+
end
|
28
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fizx-csvget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kyle Maxwell
|
@@ -10,7 +10,7 @@ bindir: bin
|
|
10
10
|
cert_chain: []
|
11
11
|
|
12
12
|
date: 2009-08-28 00:00:00 -07:00
|
13
|
-
default_executable:
|
13
|
+
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: fizx-rwget
|
@@ -56,6 +56,7 @@ description: Super easy to use (but lots of dependencies :/) parser
|
|
56
56
|
email: kyle@kylemaxwell.com
|
57
57
|
executables:
|
58
58
|
- csvget
|
59
|
+
- jsonget
|
59
60
|
extensions: []
|
60
61
|
|
61
62
|
extra_rdoc_files:
|
@@ -68,7 +69,10 @@ files:
|
|
68
69
|
- Rakefile
|
69
70
|
- VERSION
|
70
71
|
- bin/csvget
|
72
|
+
- bin/jsonget
|
73
|
+
- csvget.gemspec
|
71
74
|
- lib/csvget.rb
|
75
|
+
- lib/jsonget.rb
|
72
76
|
- test/csvget_test.rb
|
73
77
|
has_rdoc: false
|
74
78
|
homepage: http://github.com/fizx/csvget
|