csvget 0.0.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2 @@
1
+ *.gem
2
+ *.csv
@@ -0,0 +1,8 @@
1
+ 0.3.2 - Sept 10, 2009
2
+ -- Don't write headers when appending to an existing CSV file.
3
+ 0.3.0 - Sept 2, 2009
4
+ -- Added command-line option:
5
+
6
+ --filter=RUBY_CODE RUBY_CODE will be eval'd in context of @row.is_a?(FasterCSV::Row)
7
+
8
+ -- Added CHANGELOG
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Kyle Maxwell
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,93 @@
1
+ = csvget (also, jsonget)
2
+
3
+ == What's new in 0.3.0
4
+
5
+ 1. Added command-line option:
6
+
7
+ --filter=RUBY_CODE RUBY_CODE is eval'd once for every output row, with the row available as @row (a FasterCSV::Row). May be given more than once; filters run in the order given.
8
+
9
+ # Example:
10
+ ./bin/csvget --require=chronic --require=time --filter "@row['time']=Chronic.parse(@row['time']).iso8601" # ...
11
+
12
+ 2. Added CHANGELOG
13
+
14
+ == Dependencies
15
+
16
+ - http://github.com/fizx/parsley/tree/master and its dependencies.
17
+ - Rubygems
18
+
19
+ == Running on EC2
20
+
21
+ > git clone git://github.com/fizx/csvget-ec2-recipe.git
22
+ > cd csvget-ec2-recipe
23
+ > ./boot.rb
24
+ > ssh YOUR_INSTANCE
25
+
26
+ == Local Installation
27
+
28
+ 1. Install the dependencies.
29
+ 2. > gem sources -a http://gems.github.com
30
+ 3. > sudo gem install fizx-csvget
31
+
32
+ == Example Usage
33
+
34
+ > cat hn.let
35
+ {
36
+ "headlines":[{
37
+ "title": ".title a",
38
+ "link": ".title a @href",
39
+ "comments": "match(.subtext a:nth-child(3), '\\d+')",
40
+ "user": ".subtext a:nth-child(2)",
41
+ "score": "match(.subtext span, '\\d+')",
42
+ "time": "match(.subtext, '\\d+\\s+\\w+\\s+ago')"
43
+ }]
44
+ }
45
+ > csvget --directory-prefix=./data -A "/x" -w 5 --parselet=hn.let http://news.ycombinator.com/
46
+ > head data/headlines.csv
47
+ comments,title,time,link,score,user
48
+ 4,Simpson's paradox: why mistrust seemingly simple statistics,2 hours ago,http://en.wikipedia.org/wiki/Simpson%27s_paradox,41,waldrews
49
+ 67,America's unjust sex laws,2 hours ago,http://www.economist.com/opinion/displaystory.cfm?story_id=14165460,59,MikeCapone
50
+ 23,Buy somebody lunch,3 hours ago,http://www.whattofix.com/blog/archives/2009/08/buy-somebody-lu.php,58,DanielBMarkham
51
+ 10,A design pattern is an artifact of a missing feature in your chosen language,3 hours ago,http://www.snell-pym.org.uk/archives/2008/12/29/design-patterns/,31,bensummers
52
+ 4,API changes in Snow Leopard,1 hour ago,http://developer.apple.com/mac/library/releasenotes/MacOSX/WhatsNewInOSX/Articles/MacOSX10_6.html#//apple_ref/doc/uid/TP40008898-SW1,14,pieter
53
+ 16,How to run a linux based home web server,3 hours ago,http://stevehanov.ca/blog/index.php?id=73,28,RiderOfGiraffes
54
+ 1,"OpenCL ""Hello World""",1 hour ago,"http://developer.apple.com/mac/library/documentation/Performance/Conceptual/OpenCL_MacProgGuide/Example:Hello,World/Example:Hello,World.html",8,pieter
55
+ 15,US Senate bill allows White House to disconnect private computers from Internet,4 hours ago,http://news.cnet.com/8301-13578_3-10320096-38.html,35,drewr
56
+ 1,Strategy: Solve Only 80 Percent of the Problem,47 minutes ago,http://highscalability.com/strategy-solve-only-80-percent-problem,6,alrex021
57
+ > csvget -h
58
+ Usage: ./bin/csvget [options] SEED_URL [SEED_URL2 ...]
59
+ --parselet=JSON_FILE JSON_FILE is a parselet.
60
+ -w, --wait=SECONDS wait SECONDS between retrievals.
61
+ -P, --directory-prefix=PREFIX save files to PREFIX/...
62
+ -U, --user-agent=AGENT identify as AGENT instead of RWget/VERSION.
63
+ -A, --accept-pattern=RUBY_REGEX URLs must match RUBY_REGEX to be saved to the queue.
64
+ --time-limit=AMOUNT Crawler will stop after this AMOUNT of time has passed.
65
+ -R, --reject-pattern=RUBY_REGEX URLs must NOT match RUBY_REGEX to be saved to the queue.
66
+ --require=RUBY_SCRIPT Will execute 'require RUBY_SCRIPT'
67
+ --limit-rate=RATE limit download rate to RATE.
68
+ --http-proxy=URL Proxies via URL
69
+ --proxy-user=USER Sets proxy user to USER
70
+ --proxy-password=PASSWORD Sets proxy password to PASSWORD
71
+ --fetch-class=RUBY_CLASS Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]
72
+ --store-class=RUBY_CLASS Must implement put(key_string, temp_file)
73
+ --dupes-class=RUBY_CLASS Must implement dupe?(uri)
74
+ --queue-class=RUBY_CLASS Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]
75
+ --links-class=RUBY_CLASS Must implement urls(base_uri, temp_file) #=> [uri, ...]
76
+ -S, --sitemap=URL URL of a sitemap to crawl (will ignore inter-page links)
77
+ -V, --version
78
+ -Q, --quota=NUMBER set retrieval quota to NUMBER.
79
+ --max-redirect=NUM maximum redirections allowed per page.
80
+ -H, --span-hosts go to foreign hosts when recursive
81
+ --connect-timeout=SECS set the connect timeout to SECS.
82
+ -T, --timeout=SECS set all timeout values to SECONDS.
83
+ -l, --level=NUMBER maximum recursion depth (inf or 0 for infinite).
84
+ --[no-]timestampize Prepend the timestamp of when the crawl started to the directory structure.
85
+ --incremental-from=PREVIOUS Build upon the indexing already saved in PREVIOUS.
86
+ --protocol-directories use protocol name in directories.
87
+ --no-host-directories don't create host directories.
88
+ -v, --[no-]verbose Run verbosely
89
+ -h, --help Show this message
90
+
91
+ == Copyright
92
+
93
+ Copyright (c) 2009 Kyle Maxwell. See LICENSE for details (MIT).
@@ -0,0 +1,62 @@
1
# Rakefile for csvget: gem packaging (Jeweler), unit tests, coverage (RCov)
# and rdoc generation.
require 'rubygems'
require 'rake'

# Packaging tasks. Jeweler is optional: when it cannot be loaded the packaging
# tasks are simply not defined and an installation hint is printed instead.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "csvget"
    gem.executables = ["csvget", "jsonget"]
    gem.summary = %Q{Uses parselets and rwget to generate csv files from websites}
    gem.description = %Q{Super easy to use (but lots of dependencies :/) parser}
    gem.email = "kyle@kylemaxwell.com"
    gem.homepage = "http://github.com/fizx/csvget"
    gem.authors = ["Kyle Maxwell"]
    gem.add_dependency("fizx-rwget", ["> 0.2.3"])
    gem.add_dependency("fizx-parsley-ruby", ["> 0.0.0"])
    gem.add_dependency("activesupport", ["> 0.0.0"])
    gem.add_dependency("fastercsv", [">= 1.4.0"])

    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests: every test/**/*_test.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage. RCov is optional; without it the :rcov task only explains how to
# install it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is not defined anywhere in this Rakefile (it is expected
# to come from the Jeweler tasks above). Only make it a prerequisite when it
# actually exists, so `rake test` keeps working when Jeweler is not installed.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation from the README and everything under lib/.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # The title carries the version when a VERSION file is present.
  version = File.exist?('VERSION') ? File.read('VERSION') : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "csvget #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.4.0
@@ -0,0 +1,34 @@
1
#!/usr/bin/env ruby
# csvget executable: extends the rwget option parser with the csvget-specific
# switches, then runs an rwget crawl whose pages are stored through CSVStore.
require "rubygems"
require File.join(File.dirname(__FILE__), "..", "lib", "csvget")
require "rwget"

parser = RWGetOptionParser.new do |opts|
  # Both switches are repeatable: every occurrence is appended to a list.
  opts.on("--parselet=JSON_FILE", "JSON_FILE is a parselet.") do |path|
    (parser.options[:parselets] ||= []) << path
  end

  opts.on("--filter=RUBY_CODE", "RUBY_CODE will be eval'd in context of @row.is_a?(FasterCSV::Row)") do |filter|
    (parser.options[:filter] ||= []) << filter
  end
end
parser.parse!

# Without at least one seed URL there is nothing to crawl.
if parser.options[:seeds].empty?
  puts parser.usage, " -h for options listing"
  exit 1
end

# An explicitly chosen store class wins; CSVStore is only the default.
parser.options[:store_class] ||= "CSVStore"

controller = RWGet::Controller.new(parser.options)
begin
  controller.start
ensure
  # Runs whether the crawl finished or raised.
  STDERR.puts "Closing..."
  controller.close
end
@@ -0,0 +1,28 @@
1
#!/usr/bin/env ruby
# jsonget executable: extends the rwget option parser with --parselet, then
# runs an rwget crawl whose pages are stored through JSONStore.
require "rubygems"
require File.join(File.dirname(__FILE__), "..", "lib", "jsonget")
require "rwget"

parser = RWGetOptionParser.new do |opts|
  # Repeatable: every --parselet occurrence is appended to the list.
  opts.on("--parselet=JSON_FILE", "JSON_FILE is a parselet.") do |path|
    (parser.options[:parselets] ||= []) << path
  end
end
parser.parse!

# Without at least one seed URL there is nothing to crawl.
if parser.options[:seeds].empty?
  puts parser.usage, " -h for options listing"
  exit 1
end

# An explicitly chosen store class wins; JSONStore is only the default.
parser.options[:store_class] ||= "JSONStore"

controller = RWGet::Controller.new(parser.options)
begin
  controller.start
ensure
  # Runs whether the crawl finished or raised.
  STDERR.puts "Closing..."
  controller.close
end
@@ -0,0 +1,65 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{csvget}
8
+ s.version = "0.4.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Kyle Maxwell"]
12
+ s.date = %q{2009-10-16}
13
+ s.description = %q{Super easy to use (but lots of dependencies :/) parser}
14
+ s.email = %q{kyle@kylemaxwell.com}
15
+ s.executables = ["csvget", "jsonget"]
16
+ s.extra_rdoc_files = [
17
+ "LICENSE",
18
+ "README.rdoc"
19
+ ]
20
+ s.files = [
21
+ ".gitignore",
22
+ "CHANGELOG",
23
+ "LICENSE",
24
+ "README.rdoc",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "bin/csvget",
28
+ "bin/jsonget",
29
+ "csvget.gemspec",
30
+ "hn.let",
31
+ "lib/csvget.rb",
32
+ "lib/jsonget.rb",
33
+ "test/csvget_test.rb"
34
+ ]
35
+ s.homepage = %q{http://github.com/fizx/csvget}
36
+ s.rdoc_options = ["--charset=UTF-8"]
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = %q{1.3.4}
39
+ s.summary = %q{Uses parselets and rwget to generate csv files from websites}
40
+ s.test_files = [
41
+ "test/csvget_test.rb"
42
+ ]
43
+
44
+ if s.respond_to? :specification_version then
45
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
46
+ s.specification_version = 3
47
+
48
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
49
+ s.add_runtime_dependency(%q<fizx-rwget>, ["> 0.2.3"])
50
+ s.add_runtime_dependency(%q<fizx-parsley-ruby>, ["> 0.0.0"])
51
+ s.add_runtime_dependency(%q<activesupport>, ["> 0.0.0"])
52
+ s.add_runtime_dependency(%q<fastercsv>, [">= 1.4.0"])
53
+ else
54
+ s.add_dependency(%q<fizx-rwget>, ["> 0.2.3"])
55
+ s.add_dependency(%q<fizx-parsley-ruby>, ["> 0.0.0"])
56
+ s.add_dependency(%q<activesupport>, ["> 0.0.0"])
57
+ s.add_dependency(%q<fastercsv>, [">= 1.4.0"])
58
+ end
59
+ else
60
+ s.add_dependency(%q<fizx-rwget>, ["> 0.2.3"])
61
+ s.add_dependency(%q<fizx-parsley-ruby>, ["> 0.0.0"])
62
+ s.add_dependency(%q<activesupport>, ["> 0.0.0"])
63
+ s.add_dependency(%q<fastercsv>, [">= 1.4.0"])
64
+ end
65
+ end
data/hn.let ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "headlines":[{
3
+ "title": ".title a",
4
+ "link": ".title a @href",
5
+ "comments": "match(.subtext a:nth-child(3), '\\d+')",
6
+ "user": ".subtext a:nth-child(2)",
7
+ "score": "match(.subtext span, '\\d+')",
8
+ "time": "match(.subtext, '\\d+\\s+\\w+\\s+ago')"
9
+ }]
10
+ }
@@ -0,0 +1,54 @@
1
+ require "rubygems"
2
+ require "rwget"
3
+ require "parsley"
4
+ require "fastercsv"
5
+ require "activesupport"
6
+ require "fileutils"
7
+
8
# Store class that runs every fetched page through the configured parselets
# and appends the extracted records to CSV files, one file per top-level key
# of the parselet output (e.g. "headlines" -> <prefix>/headlines.csv).
class CSVStore
  # options - Hash of options. Keys read here:
  #           :prefix    - directory the CSV files are written to (default ".");
  #                        created if missing.
  #           :filter    - Array of Ruby source strings, eval'd once per row.
  #           :parselets - Array of paths to parselet files.
  def initialize(options = {})
    @output_folder = options[:prefix] || "."
    @filters = options[:filter] || []
    FileUtils.mkdir_p(@output_folder)
    @parselets = (options[:parselets] || []).map { |path| Parsley.new(File.read(path)) }
    @files = {}    # top-level key => open FasterCSV writer
    @headers = {}  # top-level key => column names, fixed by the first record seen
  end

  # Parses one fetched page with every parselet and appends the results.
  #
  # host    - unused here.
  # tmpfile - object responding to #path; the fetched page on disk.
  #
  # A ParsleyError is reported on STDERR as a warning and does not abort the
  # remaining parselets.
  def put(host, tmpfile)
    @parselets.each do |parselet|
      begin
        # Ask file(1) whether the document looks like XML; anything else is
        # parsed as HTML.
        type = (`file "#{tmpfile.path}"` =~ /xml/i) ? :xml : :html
        output = parselet.parse(:file => tmpfile.path, :input => type)
        walk(output)
      rescue ParsleyError => e
        STDERR.puts "warning: #{e.message}"
      end
    end
  end

  # Appends the records in +data+ to the matching CSV files.
  #
  # data   - Hash of top-level key => record Hash or Array of record Hashes.
  # prefix - unused; kept so existing callers keep working.
  #
  # NOTE: filters are eval'd inside this method, so its local variable names
  # are visible to user-supplied filter code. They are deliberately left as
  # they were.
  def walk(data, prefix = nil)
    data.each do |prefix, values|
      values = [values] unless values.is_a?(Array)
      # A key with no records has no first record to take the header from;
      # skip it instead of failing on nil.
      next if values.empty?

      file_name = File.join(@output_folder, "#{prefix}.csv")
      h = @headers[prefix] ||= values.first.keys
      # Headers are only written when this run creates the file, never when
      # appending to an existing one.
      should_write_headers = !File.exist?(file_name)
      f = @files[prefix] ||= FasterCSV.open(file_name, "a", :headers => h, :write_headers => should_write_headers)

      values.each do |hash|
        # Order the record's values by the header; missing keys become nil.
        arr = h.map { |key| hash[key] }
        @row = FasterCSV::Row.new(h, arr)
        # SECURITY: filters are arbitrary Ruby taken from --filter and run
        # with the full privileges of this process.
        @filters.each { |filter| eval(filter) }
        f << @row
      end
    end
  end

  # Closes every CSV file opened by #walk.
  def close
    @files.each_value { |csv| csv.close }
  end
end
@@ -0,0 +1,29 @@
1
+ require "rubygems"
2
+ require "rwget"
3
+ require "parsley"
4
+ require "activesupport"
5
+ require "fileutils"
6
+
7
# Store class that runs every fetched page through the configured parselets
# and appends each JSON result to a per-parselet file named
# <prefix>/<parselet file name>.json.
#
# Every result is written followed by a comma, so a file holds a sequence of
# comma-terminated JSON fragments rather than a single JSON document.
class JSONStore
  # options - Hash of options. Keys read here:
  #           :prefix    - directory the .json files are written to
  #                        (default "."); created if missing.
  #           :parselets - Array of paths to parselet files.
  def initialize(options = {})
    @output_folder = options[:prefix] || "."
    FileUtils.mkdir_p(@output_folder)
    parselet_paths = options[:parselets] || []
    @parselets = parselet_paths.map { |path| Parsley.new(File.read(path)) }
    # One append-mode output file per parselet, at the same index as the
    # parselet, inside the output folder.
    @files = parselet_paths.map do |path|
      File.open(File.join(@output_folder, "#{File.basename(path)}.json"), "a")
    end
  end

  # Parses one fetched page with every parselet and appends the JSON output
  # to that parselet's file.
  #
  # host    - unused here.
  # tmpfile - object responding to #path; the fetched page on disk.
  #
  # A ParsleyError is reported on STDERR as a warning; nothing is written for
  # that parselet and the remaining parselets still run.
  def put(host, tmpfile)
    @parselets.zip(@files).each do |parselet, file|
      begin
        # Ask file(1) whether the document looks like XML; anything else is
        # parsed as HTML.
        type = (`file "#{tmpfile.path}"` =~ /xml/i) ? :xml : :html
        output = parselet.parse(:file => tmpfile.path, :input => type, :output => :json) + ","
        file.write(output)
      rescue ParsleyError => e
        STDERR.puts "warning: #{e.message}"
      end
    end
  end

  # Closes every output file.
  def close
    @files.map { |file| file.close }
  end
end
@@ -0,0 +1,20 @@
1
+ require "test/unit"
2
+ require File.dirname(__FILE__) + "/../lib/csvget"
3
+ require "fileutils"
4
+
5
# Exercises CSVStore#walk directly with hand-built parselet output, so no
# network access and no parselet files are needed.
#
# The test that used to be commented out here targeted a ParseletLinks class
# and fixture files that are not shipped with this gem, so it could not be
# re-enabled as written.
class CsvgetTest < Test::Unit::TestCase
  include FileUtils

  # Each test writes into a scratch directory next to this file.
  def setup
    @dir = File.join(File.dirname(__FILE__), "tmp_csvget_test")
    rm_rf @dir
  end

  def teardown
    rm_rf @dir
  end

  # Single-column records keep the expected output independent of Hash
  # ordering.
  def test_walk_writes_headers_then_rows
    store = CSVStore.new(:prefix => @dir)
    store.walk("headlines" => [{"title" => "Hello"}, {"title" => "World"}])
    store.close

    assert_equal("title\nHello\nWorld\n", File.read(File.join(@dir, "headlines.csv")))
  end

  # Appending to a file left behind by an earlier run must not repeat the
  # header row.
  def test_headers_are_not_repeated_when_appending
    first = CSVStore.new(:prefix => @dir)
    first.walk("headlines" => [{"title" => "Hello"}])
    first.close

    second = CSVStore.new(:prefix => @dir)
    second.walk("headlines" => [{"title" => "World"}])
    second.close

    assert_equal("title\nHello\nWorld\n", File.read(File.join(@dir, "headlines.csv")))
  end

  # A bare Hash is treated like a one-element list of records.
  def test_walk_accepts_a_single_record
    store = CSVStore.new(:prefix => @dir)
    store.walk("story" => {"title" => "Hello"})
    store.close

    assert_equal("title\nHello\n", File.read(File.join(@dir, "story.csv")))
  end
end
20
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csvget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kyle Maxwell
@@ -9,27 +9,80 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-10-10 00:00:00 -07:00
12
+ date: 2009-10-16 00:00:00 -07:00
13
13
  default_executable:
14
- dependencies: []
15
-
16
- description:
17
- email:
18
- executables: []
19
-
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: fizx-rwget
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.2.3
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: fizx-parsley-ruby
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.0.0
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: activesupport
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">"
42
+ - !ruby/object:Gem::Version
43
+ version: 0.0.0
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: fastercsv
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 1.4.0
54
+ version:
55
+ description: Super easy to use (but lots of dependencies :/) parser
56
+ email: kyle@kylemaxwell.com
57
+ executables:
58
+ - csvget
59
+ - jsonget
20
60
  extensions: []
21
61
 
22
- extra_rdoc_files: []
23
-
24
- files: []
25
-
62
+ extra_rdoc_files:
63
+ - LICENSE
64
+ - README.rdoc
65
+ files:
66
+ - .gitignore
67
+ - CHANGELOG
68
+ - LICENSE
69
+ - README.rdoc
70
+ - Rakefile
71
+ - VERSION
72
+ - bin/csvget
73
+ - bin/jsonget
74
+ - csvget.gemspec
75
+ - hn.let
76
+ - lib/csvget.rb
77
+ - lib/jsonget.rb
78
+ - test/csvget_test.rb
26
79
  has_rdoc: true
27
- homepage:
80
+ homepage: http://github.com/fizx/csvget
28
81
  licenses: []
29
82
 
30
83
  post_install_message:
31
- rdoc_options: []
32
-
84
+ rdoc_options:
85
+ - --charset=UTF-8
33
86
  require_paths:
34
87
  - lib
35
88
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -50,6 +103,6 @@ rubyforge_project:
50
103
  rubygems_version: 1.3.4
51
104
  signing_key:
52
105
  specification_version: 3
53
- summary: Placeholder for a gem to be migrated later
54
- test_files: []
55
-
106
+ summary: Uses parselets and rwget to generate csv files from websites
107
+ test_files:
108
+ - test/csvget_test.rb