csvget 0.0.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ *.gem
2
+ *.csv
@@ -0,0 +1,8 @@
1
+ 0.3.2 - Sept 10, 2009
2
+ -- Don't write headers when appending to an existing CSV file.
3
+ 0.3.0 - Sept 2, 2009
4
+ -- Added command-line option:
5
+
6
+ --filter=RUBY_CODE RUBY_CODE will be eval'd in context of @row.is_a?(FasterCSV::Row)
7
+
8
+ -- Added CHANGELOG
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Kyle Maxwell
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,93 @@
1
+ = csvget (also, jsonget)
2
+
3
+ == What's new in 0.3.0
4
+
5
+ 1. Added command-line option:
6
+
7
+ --filter=RUBY_CODE RUBY_CODE will be eval'd in context of @row.is_a?(FasterCSV::Row)
8
+
9
+ # Example:
10
+ ./bin/csvget --require=chronic --require=time --filter "@row['time']=Chronic.parse(@row['time']).iso8601" # ...
11
+
12
+ 2. Added CHANGELOG
13
+
14
+ == Dependencies
15
+
16
+ - http://github.com/fizx/parsley/tree/master and its dependencies.
17
+ - Rubygems
18
+
19
+ == Running on EC2
20
+
21
+ > git clone git://github.com/fizx/csvget-ec2-recipe.git
22
+ > cd csvget-ec2-recipe
23
+ > ./boot.rb
24
+ > ssh YOUR_INSTANCE
25
+
26
+ == Local Installation
27
+
28
+ 1. Install the dependencies.
29
+ 2. > gem sources -a http://gems.github.com
30
+ 3. > sudo gem install fizx-csvget
31
+
32
+ == Example Usage
33
+
34
+ > cat hn.let
35
+ {
36
+ "headlines":[{
37
+ "title": ".title a",
38
+ "link": ".title a @href",
39
+ "comments": "match(.subtext a:nth-child(3), '\\d+')",
40
+ "user": ".subtext a:nth-child(2)",
41
+ "score": "match(.subtext span, '\\d+')",
42
+ "time": "match(.subtext, '\\d+\\s+\\w+\\s+ago')"
43
+ }]
44
+ }
45
+ > csvget --directory-prefix=./data -A "/x" -w 5 --parselet=hn.let http://news.ycombinator.com/
46
+ > head data/headlines.csv
47
+ comments,title,time,link,score,user
48
+ 4,Simpson's paradox: why mistrust seemingly simple statistics,2 hours ago,http://en.wikipedia.org/wiki/Simpson%27s_paradox,41,waldrews
49
+ 67,America's unjust sex laws,2 hours ago,http://www.economist.com/opinion/displaystory.cfm?story_id=14165460,59,MikeCapone
50
+ 23,Buy somebody lunch,3 hours ago,http://www.whattofix.com/blog/archives/2009/08/buy-somebody-lu.php,58,DanielBMarkham
51
+ 10,A design pattern is an artifact of a missing feature in your chosen language,3 hours ago,http://www.snell-pym.org.uk/archives/2008/12/29/design-patterns/,31,bensummers
52
+ 4,API changes in Snow Leopard,1 hour ago,http://developer.apple.com/mac/library/releasenotes/MacOSX/WhatsNewInOSX/Articles/MacOSX10_6.html#//apple_ref/doc/uid/TP40008898-SW1,14,pieter
53
+ 16,How to run a linux based home web server,3 hours ago,http://stevehanov.ca/blog/index.php?id=73,28,RiderOfGiraffes
54
+ 1,"OpenCL ""Hello World""",1 hour ago,"http://developer.apple.com/mac/library/documentation/Performance/Conceptual/OpenCL_MacProgGuide/Example:Hello,World/Example:Hello,World.html",8,pieter
55
+ 15,US Senate bill allows White House to disconnect private computers from Internet,4 hours ago,http://news.cnet.com/8301-13578_3-10320096-38.html,35,drewr
56
+ 1,Strategy: Solve Only 80 Percent of the Problem,47 minutes ago,http://highscalability.com/strategy-solve-only-80-percent-problem,6,alrex021
57
+ > csvget -h
58
+ Usage: ./bin/csvget [options] SEED_URL [SEED_URL2 ...]
59
+ --parselet=JSON_FILE JSON_FILE is a parselet.
60
+ -w, --wait=SECONDS wait SECONDS between retrievals.
61
+ -P, --directory-prefix=PREFIX save files to PREFIX/...
62
+ -U, --user-agent=AGENT identify as AGENT instead of RWget/VERSION.
63
+ -A, --accept-pattern=RUBY_REGEX URLs must match RUBY_REGEX to be saved to the queue.
64
+ --time-limit=AMOUNT Crawler will stop after this AMOUNT of time has passed.
65
+ -R, --reject-pattern=RUBY_REGEX URLs must NOT match RUBY_REGEX to be saved to the queue.
66
+ --require=RUBY_SCRIPT Will execute 'require RUBY_SCRIPT'
67
+ --limit-rate=RATE limit download rate to RATE.
68
+ --http-proxy=URL Proxies via URL
69
+ --proxy-user=USER Sets proxy user to USER
70
+ --proxy-password=PASSWORD Sets proxy password to PASSWORD
71
+ --fetch-class=RUBY_CLASS Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]
72
+ --store-class=RUBY_CLASS Must implement put(key_string, temp_file)
73
+ --dupes-class=RUBY_CLASS Must implement dupe?(uri)
74
+ --queue-class=RUBY_CLASS Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]
75
+ --links-class=RUBY_CLASS Must implement urls(base_uri, temp_file) #=> [uri, ...]
76
+ -S, --sitemap=URL URL of a sitemap to crawl (will ignore inter-page links)
77
+ -V, --version
78
+ -Q, --quota=NUMBER set retrieval quota to NUMBER.
79
+ --max-redirect=NUM maximum redirections allowed per page.
80
+ -H, --span-hosts go to foreign hosts when recursive
81
+ --connect-timeout=SECS set the connect timeout to SECS.
82
+ -T, --timeout=SECS set all timeout values to SECONDS.
83
+ -l, --level=NUMBER maximum recursion depth (inf or 0 for infinite).
84
+ --[no-]timestampize Prepend the timestamp of when the crawl started to the directory structure.
85
+ --incremental-from=PREVIOUS Build upon the indexing already saved in PREVIOUS.
86
+ --protocol-directories use protocol name in directories.
87
+ --no-host-directories don't create host directories.
88
+ -v, --[no-]verbose Run verbosely
89
+ -h, --help Show this message
90
+
91
+ == Copyright
92
+
93
+ Copyright (c) 2009 Kyle Maxwell. See LICENSE for details (MIT).
@@ -0,0 +1,62 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
+ begin
5
+ require 'jeweler'
6
+ Jeweler::Tasks.new do |gem|
7
+ gem.name = "csvget"
8
+ gem.executables = ["csvget", "jsonget"]
9
+ gem.summary = %Q{Uses parselets and rwget to generate csv files from websites}
10
+ gem.description = %Q{Super easy to use (but lots of dependencies :/) parser}
11
+ gem.email = "kyle@kylemaxwell.com"
12
+ gem.homepage = "http://github.com/fizx/csvget"
13
+ gem.authors = ["Kyle Maxwell"]
14
+ gem.add_dependency("fizx-rwget", ["> 0.2.3"])
15
+ gem.add_dependency("fizx-parsley-ruby", ["> 0.0.0"])
16
+ gem.add_dependency("activesupport", ["> 0.0.0"])
17
+ gem.add_dependency("fastercsv", [">= 1.4.0"])
18
+
19
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
20
+ end
21
+ Jeweler::GemcutterTasks.new
22
+ rescue LoadError
23
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
24
+ end
25
+
26
+ require 'rake/testtask'
27
+ Rake::TestTask.new(:test) do |test|
28
+ test.libs << 'lib' << 'test'
29
+ test.pattern = 'test/**/*_test.rb'
30
+ test.verbose = true
31
+ end
32
+
33
+ begin
34
+ require 'rcov/rcovtask'
35
+ Rcov::RcovTask.new do |test|
36
+ test.libs << 'test'
37
+ test.pattern = 'test/**/*_test.rb'
38
+ test.verbose = true
39
+ end
40
+ rescue LoadError
41
+ task :rcov do
42
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
43
+ end
44
+ end
45
+
46
+ task :test => :check_dependencies
47
+
48
+ task :default => :test
49
+
50
+ require 'rake/rdoctask'
51
+ Rake::RDocTask.new do |rdoc|
52
+ if File.exist?('VERSION')
53
+ version = File.read('VERSION')
54
+ else
55
+ version = ""
56
+ end
57
+
58
+ rdoc.rdoc_dir = 'rdoc'
59
+ rdoc.title = "csvget #{version}"
60
+ rdoc.rdoc_files.include('README*')
61
+ rdoc.rdoc_files.include('lib/**/*.rb')
62
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.4.0
@@ -0,0 +1,34 @@
1
+ #!/usr/bin/env ruby
2
+ require "rubygems"
3
+ require File.dirname(__FILE__) + "/../lib/csvget"
4
+ require "rwget"
5
+
6
+ parser = RWGetOptionParser.new do |opts|
7
+ opts.on("--parselet=JSON_FILE", "JSON_FILE is a parselet.") do |path|
8
+ parser.options[:parselets] ||= []
9
+ parser.options[:parselets] << path
10
+ end
11
+
12
+ opts.on("--filter=RUBY_CODE", "RUBY_CODE will be eval'd in context of @row.is_a?(FasterCSV::Row)") do |filter|
13
+ parser.options[:filter] ||= []
14
+ parser.options[:filter] << filter
15
+ end
16
+
17
+ end
18
+ parser.parse!
19
+
20
+ if parser.options[:seeds].empty?
21
+ puts parser.usage
22
+ puts " -h for options listing"
23
+ exit(1)
24
+ end
25
+
26
+ parser.options[:store_class] ||= "CSVStore"
27
+
28
+ controller = RWGet::Controller.new(parser.options)
29
+ begin
30
+ controller.start
31
+ ensure
32
+ STDERR.puts "Closing..."
33
+ controller.close
34
+ end
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env ruby
2
+ require "rubygems"
3
+ require File.dirname(__FILE__) + "/../lib/jsonget"
4
+ require "rwget"
5
+
6
+ parser = RWGetOptionParser.new do |opts|
7
+ opts.on("--parselet=JSON_FILE", "JSON_FILE is a parselet.") do |path|
8
+ parser.options[:parselets] ||= []
9
+ parser.options[:parselets] << path
10
+ end
11
+ end
12
+ parser.parse!
13
+
14
+ if parser.options[:seeds].empty?
15
+ puts parser.usage
16
+ puts " -h for options listing"
17
+ exit(1)
18
+ end
19
+
20
+ parser.options[:store_class] ||= "JSONStore"
21
+
22
+ controller = RWGet::Controller.new(parser.options)
23
+ begin
24
+ controller.start
25
+ ensure
26
+ STDERR.puts "Closing..."
27
+ controller.close
28
+ end
@@ -0,0 +1,65 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{csvget}
8
+ s.version = "0.4.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["Kyle Maxwell"]
12
+ s.date = %q{2009-10-16}
13
+ s.description = %q{Super easy to use (but lots of dependencies :/) parser}
14
+ s.email = %q{kyle@kylemaxwell.com}
15
+ s.executables = ["csvget", "jsonget"]
16
+ s.extra_rdoc_files = [
17
+ "LICENSE",
18
+ "README.rdoc"
19
+ ]
20
+ s.files = [
21
+ ".gitignore",
22
+ "CHANGELOG",
23
+ "LICENSE",
24
+ "README.rdoc",
25
+ "Rakefile",
26
+ "VERSION",
27
+ "bin/csvget",
28
+ "bin/jsonget",
29
+ "csvget.gemspec",
30
+ "hn.let",
31
+ "lib/csvget.rb",
32
+ "lib/jsonget.rb",
33
+ "test/csvget_test.rb"
34
+ ]
35
+ s.homepage = %q{http://github.com/fizx/csvget}
36
+ s.rdoc_options = ["--charset=UTF-8"]
37
+ s.require_paths = ["lib"]
38
+ s.rubygems_version = %q{1.3.4}
39
+ s.summary = %q{Uses parselets and rwget to generate csv files from websites}
40
+ s.test_files = [
41
+ "test/csvget_test.rb"
42
+ ]
43
+
44
+ if s.respond_to? :specification_version then
45
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
46
+ s.specification_version = 3
47
+
48
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
49
+ s.add_runtime_dependency(%q<fizx-rwget>, ["> 0.2.3"])
50
+ s.add_runtime_dependency(%q<fizx-parsley-ruby>, ["> 0.0.0"])
51
+ s.add_runtime_dependency(%q<activesupport>, ["> 0.0.0"])
52
+ s.add_runtime_dependency(%q<fastercsv>, [">= 1.4.0"])
53
+ else
54
+ s.add_dependency(%q<fizx-rwget>, ["> 0.2.3"])
55
+ s.add_dependency(%q<fizx-parsley-ruby>, ["> 0.0.0"])
56
+ s.add_dependency(%q<activesupport>, ["> 0.0.0"])
57
+ s.add_dependency(%q<fastercsv>, [">= 1.4.0"])
58
+ end
59
+ else
60
+ s.add_dependency(%q<fizx-rwget>, ["> 0.2.3"])
61
+ s.add_dependency(%q<fizx-parsley-ruby>, ["> 0.0.0"])
62
+ s.add_dependency(%q<activesupport>, ["> 0.0.0"])
63
+ s.add_dependency(%q<fastercsv>, [">= 1.4.0"])
64
+ end
65
+ end
data/hn.let ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "headlines":[{
3
+ "title": ".title a",
4
+ "link": ".title a @href",
5
+ "comments": "match(.subtext a:nth-child(3), '\\d+')",
6
+ "user": ".subtext a:nth-child(2)",
7
+ "score": "match(.subtext span, '\\d+')",
8
+ "time": "match(.subtext, '\\d+\\s+\\w+\\s+ago')"
9
+ }]
10
+ }
@@ -0,0 +1,54 @@
1
+ require "rubygems"
2
+ require "rwget"
3
+ require "parsley"
4
+ require "fastercsv"
5
+ require "activesupport"
6
+ require "fileutils"
7
+
8
+ class CSVStore
9
+ def initialize(options = {})
10
+ @output_folder = options[:prefix] || "."
11
+ @filters = options[:filter] || []
12
+ FileUtils.mkdir_p(@output_folder)
13
+ @parselets = (options[:parselets] || []).map{|path| Parsley.new(File.read(path)) }
14
+ @files = {}
15
+ @headers = {}
16
+ end
17
+
18
+ def put(host, tmpfile)
19
+ @parselets.each do |parselet|
20
+ begin
21
+ type = (`file "#{tmpfile.path}"` =~ /xml/i) ? :xml : :html
22
+ output = parselet.parse(:file => tmpfile.path, :input => type)
23
+ walk(output)
24
+ rescue ParsleyError => e
25
+ STDERR.puts "warning: #{e.message}"
26
+ end
27
+ end
28
+ end
29
+
30
+ def walk(data, prefix = nil)
31
+ data.each do |prefix, values|
32
+ values = [values] unless values.is_a?(Array)
33
+ file_name = File.join(@output_folder, "#{prefix}.csv")
34
+ h = @headers[prefix] ||= values.first.keys
35
+ should_write_headers = !File.exists?(file_name)
36
+ f = @files[prefix] ||= FasterCSV.open(file_name, "a", :headers => h, :write_headers => should_write_headers)
37
+
38
+ values.each do |hash|
39
+ arr = h.inject([]) do |memo, key|
40
+ memo << hash[key]
41
+ end
42
+ @row = FasterCSV::Row.new(h, arr)
43
+ @filters.each {|filter| eval(filter) }
44
+ f << @row
45
+ end
46
+ end
47
+ end
48
+
49
+ def close
50
+ @files.each do |k, v|
51
+ v.close
52
+ end
53
+ end
54
+ end
@@ -0,0 +1,29 @@
1
+ require "rubygems"
2
+ require "rwget"
3
+ require "parsley"
4
+ require "activesupport"
5
+ require "fileutils"
6
+
7
+ class JSONStore
8
+ def initialize(options = {})
9
+ @output_folder = options[:prefix] || "."
10
+ FileUtils.mkdir_p(@output_folder)
11
+ @parselets = (options[:parselets] || []).map{|path| Parsley.new(File.read(path)) }
12
+ @files = (options[:parselets] || []).map{|path| File.open("#{File.basename(path)}.json", "a") }
13
+ end
14
+
15
+ def put(host, tmpfile)
16
+ @parselets.zip(@files).each do |parselet, file|
17
+ begin
18
+ type = (`file "#{tmpfile.path}"` =~ /xml/i) ? :xml : :html
19
+ output = parselet.parse(:file => tmpfile.path, :input => type, :output => :json) + ","
20
+ rescue ParsleyError => e
21
+ STDERR.puts "warning: #{e.message}"
22
+ end
23
+ end
24
+ end
25
+
26
+ def close
27
+ @files.map(&:close)
28
+ end
29
+ end
@@ -0,0 +1,20 @@
1
+ require "test/unit"
2
+ require File.dirname(__FILE__) + "/../lib/csvget"
3
+ require "fileutils"
4
+
5
+ class CsvgetTest < Test::Unit::TestCase
6
+ include FileUtils
7
+ # NOTE(review): the only test below is commented out and refers to ParseletLinks, which lib/csvget.rb does not define; this suite currently asserts nothing.
8
+ # def setup
9
+ # @output = {"bill-state"=>"Welcome to Google Business Solutions", "links"=>["/", "https://adwords.google.com/select/Login?sourceid=awo&subid=us-en-et-bizsol-0-biz1-all&medium=link&hl=en_US"]}
10
+ # @links = ParseletLinks.new(:parselets => File.dirname(__FILE__) + "/foo.let")
11
+ # end
12
+ #
13
+ # def test_bill_state
14
+ # bills = File.dirname(__FILE__) + "/bills.csv"
15
+ # @links.walk @output
16
+ # assert_equal File.read(File.dirname(__FILE__) + "/expected.csv"), File.read(bills)
17
+ # rm bills
18
+ # end
19
+ end
20
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csvget
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kyle Maxwell
@@ -9,27 +9,80 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2009-10-10 00:00:00 -07:00
12
+ date: 2009-10-16 00:00:00 -07:00
13
13
  default_executable:
14
- dependencies: []
15
-
16
- description:
17
- email:
18
- executables: []
19
-
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: fizx-rwget
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.2.3
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: fizx-parsley-ruby
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.0.0
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: activesupport
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">"
42
+ - !ruby/object:Gem::Version
43
+ version: 0.0.0
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: fastercsv
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 1.4.0
54
+ version:
55
+ description: Super easy to use (but lots of dependencies :/) parser
56
+ email: kyle@kylemaxwell.com
57
+ executables:
58
+ - csvget
59
+ - jsonget
20
60
  extensions: []
21
61
 
22
- extra_rdoc_files: []
23
-
24
- files: []
25
-
62
+ extra_rdoc_files:
63
+ - LICENSE
64
+ - README.rdoc
65
+ files:
66
+ - .gitignore
67
+ - CHANGELOG
68
+ - LICENSE
69
+ - README.rdoc
70
+ - Rakefile
71
+ - VERSION
72
+ - bin/csvget
73
+ - bin/jsonget
74
+ - csvget.gemspec
75
+ - hn.let
76
+ - lib/csvget.rb
77
+ - lib/jsonget.rb
78
+ - test/csvget_test.rb
26
79
  has_rdoc: true
27
- homepage:
80
+ homepage: http://github.com/fizx/csvget
28
81
  licenses: []
29
82
 
30
83
  post_install_message:
31
- rdoc_options: []
32
-
84
+ rdoc_options:
85
+ - --charset=UTF-8
33
86
  require_paths:
34
87
  - lib
35
88
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -50,6 +103,6 @@ rubyforge_project:
50
103
  rubygems_version: 1.3.4
51
104
  signing_key:
52
105
  specification_version: 3
53
- summary: Placeholder for a gem to be migrated later
54
- test_files: []
55
-
106
+ summary: Uses parselets and rwget to generate csv files from websites
107
+ test_files:
108
+ - test/csvget_test.rb