fizx-csvget 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +2 -0
- data/LICENSE +20 -0
- data/README.rdoc +43 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/bin/csvget +28 -0
- data/lib/csvget.rb +48 -0
- data/test/csvget_test.rb +20 -0
- metadata +101 -0
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Kyle Maxwell
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
= csvget
|
2
|
+
|
3
|
+
== Dependencies
|
4
|
+
|
5
|
+
- http://github.com/fizx/parsley/tree/master and its dependencies.
|
6
|
+
- Rubygems
|
7
|
+
|
8
|
+
== Installation
|
9
|
+
|
10
|
+
1. Install the dependencies.
|
11
|
+
2. > gem sources -a http://gems.github.com
|
12
|
+
3. > sudo gem install fizx-csvget
|
13
|
+
|
14
|
+
== Example Usage
|
15
|
+
|
16
|
+
> cat hn.let
|
17
|
+
{
|
18
|
+
"headlines":[{
|
19
|
+
"title": ".title a",
|
20
|
+
"link": ".title a @href",
|
21
|
+
"comments": "match(.subtext a:nth-child(3), '\\d+')",
|
22
|
+
"user": ".subtext a:nth-child(2)",
|
23
|
+
"score": "match(.subtext span, '\\d+')",
|
24
|
+
"time": "match(.subtext, '\\d+\\s+\\w+\\s+ago')"
|
25
|
+
}]
|
26
|
+
}
|
27
|
+
> csvget --directory-prefix=./data -A "/x" -w 5 --parselet=hn.let http://news.ycombinator.com/
|
28
|
+
> head data/headlines.csv
|
29
|
+
comments,title,time,link,score,user
|
30
|
+
4,Simpson's paradox: why mistrust seemingly simple statistics,2 hours ago,http://en.wikipedia.org/wiki/Simpson%27s_paradox,41,waldrews
|
31
|
+
67,America's unjust sex laws,2 hours ago,http://www.economist.com/opinion/displaystory.cfm?story_id=14165460,59,MikeCapone
|
32
|
+
23,Buy somebody lunch,3 hours ago,http://www.whattofix.com/blog/archives/2009/08/buy-somebody-lu.php,58,DanielBMarkham
|
33
|
+
10,A design pattern is an artifact of a missing feature in your chosen language,3 hours ago,http://www.snell-pym.org.uk/archives/2008/12/29/design-patterns/,31,bensummers
|
34
|
+
4,API changes in Snow Leopard,1 hour ago,http://developer.apple.com/mac/library/releasenotes/MacOSX/WhatsNewInOSX/Articles/MacOSX10_6.html#//apple_ref/doc/uid/TP40008898-SW1,14,pieter
|
35
|
+
16,How to run a linux based home web server,3 hours ago,http://stevehanov.ca/blog/index.php?id=73,28,RiderOfGiraffes
|
36
|
+
1,"OpenCL ""Hello World""",1 hour ago,"http://developer.apple.com/mac/library/documentation/Performance/Conceptual/OpenCL_MacProgGuide/Example:Hello,World/Example:Hello,World.html",8,pieter
|
37
|
+
15,US Senate bill allows White House to disconnect private computers from Internet,4 hours ago,http://news.cnet.com/8301-13578_3-10320096-38.html,35,drewr
|
38
|
+
1,Strategy: Solve Only 80 Percent of the Problem,47 minutes ago,http://highscalability.com/strategy-solve-only-80-percent-problem,6,alrex021
|
39
|
+
|
40
|
+
|
41
|
+
== Copyright
|
42
|
+
|
43
|
+
Copyright (c) 2009 Kyle Maxwell. See LICENSE for details (MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
# Packaging tasks. Jeweler is optional: when it cannot be loaded a hint is
# printed and the remaining tasks in this file are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "csvget"
    gem.summary = %Q{Uses parselets and rwget to generate csv files from websites}
    gem.description = %Q{Super easy to use (but lots of dependencies :/) parser}
    gem.email = "kyle@kylemaxwell.com"
    gem.homepage = "http://github.com/fizx/csvget"
    gem.authors = ["Kyle Maxwell"]
    gem.add_dependency("fizx-rwget", ["> 0.2.3"])
    gem.add_dependency("fizx-parsley-ruby", ["> 0.0.0"])
    gem.add_dependency("activesupport", ["> 0.0.0"])
    gem.add_dependency("fastercsv", [">= 1.4.0"])

    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests: every test/**/*_test.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage task. Without rcov installed, `rake rcov` aborts with an install hint.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# Only hook the dependency check into :test when that task actually exists.
# NOTE(review): :check_dependencies is presumably defined by Jeweler::Tasks;
# when Jeweler failed to load above, an unconditional prerequisite would make
# `rake test` (and the default task) fail with an unknown-task error.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation. Newer Rake releases no longer ship rake/rdoctask, so
# prefer RDoc's own task and fall back to the legacy one on old installs.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip: the VERSION file ends with a newline that should not appear in the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "csvget #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/bin/csvget
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require "rubygems"
|
3
|
+
require File.dirname(__FILE__) + "/../lib/csvget"
|
4
|
+
require "rwget"
|
5
|
+
|
6
|
+
# Command-line entry point: builds the option parser with an extra,
# repeatable --parselet flag, then runs a controller that uses CSVStore
# as its store class unless another one was already set in the options.
parser = RWGetOptionParser.new do |opts|
  opts.on("--parselet=JSON_FILE", "JSON_FILE is a parselet.") do |path|
    # Collect every --parselet occurrence into one list.
    (parser.options[:parselets] ||= []) << path
  end
end
parser.parse!

# Without at least one seed there is nothing to do: print usage and exit non-zero.
seeds = parser.options[:seeds]
if seeds.empty?
  puts parser.usage
  puts " -h for options listing"
  exit(1)
end

parser.options[:store_class] = "CSVStore" unless parser.options[:store_class]

controller = RWGet::Controller.new(parser.options)
begin
  controller.start
ensure
  # Always close the controller, even when the run is interrupted or fails.
  STDERR.puts "Closing..."
  controller.close
end
|
data/lib/csvget.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require "rubygems"
|
2
|
+
require "rwget"
|
3
|
+
require "parsley"
|
4
|
+
require "fastercsv"
|
5
|
+
require "activesupport"
|
6
|
+
require "fileutils"
|
7
|
+
|
8
|
+
# Store that runs each fetched page through the configured parselets and
# appends the extracted records to CSV files, one file per top-level key of
# the parselet output ("<key>.csv" inside the output folder).
class CSVStore
  # @param options [Hash] recognised keys:
  #   :prefix    - output folder, created if missing (defaults to ".")
  #   :parselets - list of parselet file paths (defaults to none)
  def initialize(options = {})
    @output_folder = options[:prefix] || "."
    FileUtils.mkdir_p(@output_folder)
    @parselets = (options[:parselets] || []).map { |path| Parsley.new(File.read(path)) }
    @files = {}    # key => open CSV writer
    @headers = {}  # key => column names, fixed by the first record seen
  end

  # Parses the downloaded file with every parselet and writes the results.
  # A ParsleyError is reported on STDERR and does not stop the other parselets.
  #
  # @param host unused here
  # @param tmpfile object responding to #path, pointing at the downloaded page
  def put(host, tmpfile)
    @parselets.each do |parselet|
      begin
        walk(parselet.parse(:file => tmpfile.path))
      rescue ParsleyError => e
        STDERR.puts "warning: #{e.message}"
      end
    end
  end

  # Appends the records found under each top-level key of +data+ to that
  # key's CSV file.
  #
  # A value may be a single record or an array of records. Hash records are
  # written in the column order of the first record seen for the key. Scalar
  # records are written as a single column named after the key. Keys with no
  # records (empty array or nil) are skipped instead of raising.
  #
  # @param data [Hash] parselet output
  # @param prefix unused; kept so existing callers keep working
  def walk(data, prefix = nil)
    data.each do |name, values|
      rows = (values.is_a?(Array) ? values : [values]).compact
      next if rows.empty?

      path = File.join(@output_folder, "#{name}.csv")
      headers = @headers[name] ||= (rows.first.is_a?(Hash) ? rows.first.keys : [name])
      # The file is opened in append mode, so only emit the header row when the
      # file is new or empty; otherwise every run would repeat it mid-file.
      csv = @files[name] ||= FasterCSV.open(path, "a", :headers => headers, :write_headers => !File.size?(path))

      rows.each do |row|
        csv << (row.is_a?(Hash) ? headers.map { |key| row[key] } : [row])
      end
    end
  end

  # Closes every CSV file opened by #walk.
  def close
    @files.each_value { |csv| csv.close }
  end
end
|
data/test/csvget_test.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require File.dirname(__FILE__) + "/../lib/csvget"
|
3
|
+
require "fileutils"
|
4
|
+
|
5
|
+
# Test case for csvget. It currently defines no test methods: the only
# candidate test is commented out below.
class CsvgetTest < Test::Unit::TestCase
  include FileUtils

  # NOTE(review): the disabled test below builds a ParseletLinks object, which
  # lib/csvget.rb does not define (it only defines CSVStore), and it reads
  # fixtures (foo.let, expected.csv) that are not part of this gem's file
  # list. It looks carried over from another project - confirm before
  # re-enabling.
  #
  # def setup
  #   @output = {"bill-state"=>"Welcome to Google Business Solutions", "links"=>["/", "https://adwords.google.com/select/Login?sourceid=awo&subid=us-en-et-bizsol-0-biz1-all&medium=link&hl=en_US"]}
  #   @links = ParseletLinks.new(:parselets => File.dirname(__FILE__) + "/foo.let")
  # end
  #
  # def test_bill_state
  #   bills = File.dirname(__FILE__) + "/bills.csv"
  #   @links.walk @output
  #   assert_equal File.read(File.dirname(__FILE__) + "/expected.csv"), File.read(bills)
  #   rm bills
  # end
end
|
20
|
+
|
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: fizx-csvget
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Kyle Maxwell
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-08-28 00:00:00 -07:00
|
13
|
+
default_executable: csvget
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: fizx-rwget
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.2.3
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: fizx-parsley-ruby
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.0.0
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: activesupport
|
37
|
+
type: :runtime
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ">"
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.0.0
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: fastercsv
|
47
|
+
type: :runtime
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.4.0
|
54
|
+
version:
|
55
|
+
description: Super easy to use (but lots of dependencies :/) parser
|
56
|
+
email: kyle@kylemaxwell.com
|
57
|
+
executables:
|
58
|
+
- csvget
|
59
|
+
extensions: []
|
60
|
+
|
61
|
+
extra_rdoc_files:
|
62
|
+
- LICENSE
|
63
|
+
- README.rdoc
|
64
|
+
files:
|
65
|
+
- .gitignore
|
66
|
+
- LICENSE
|
67
|
+
- README.rdoc
|
68
|
+
- Rakefile
|
69
|
+
- VERSION
|
70
|
+
- bin/csvget
|
71
|
+
- lib/csvget.rb
|
72
|
+
- test/csvget_test.rb
|
73
|
+
has_rdoc: false
|
74
|
+
homepage: http://github.com/fizx/csvget
|
75
|
+
licenses:
|
76
|
+
post_install_message:
|
77
|
+
rdoc_options:
|
78
|
+
- --charset=UTF-8
|
79
|
+
require_paths:
|
80
|
+
- lib
|
81
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: "0"
|
86
|
+
version:
|
87
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
88
|
+
requirements:
|
89
|
+
- - ">="
|
90
|
+
- !ruby/object:Gem::Version
|
91
|
+
version: "0"
|
92
|
+
version:
|
93
|
+
requirements: []
|
94
|
+
|
95
|
+
rubyforge_project:
|
96
|
+
rubygems_version: 1.3.5
|
97
|
+
signing_key:
|
98
|
+
specification_version: 3
|
99
|
+
summary: Uses parselets and rwget to generate csv files from websites
|
100
|
+
test_files:
|
101
|
+
- test/csvget_test.rb
|