fizx-csvget 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *.gem
2
+ *.csv
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Kyle Maxwell
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,43 @@
1
+ = csvget
2
+
3
+ == Dependencies
4
+
5
+ - http://github.com/fizx/parsley/tree/master and its dependencies.
6
+ - Rubygems
7
+
8
+ == Installation
9
+
10
+ 1. Install the dependencies.
11
+ 2. > gem sources -a http://gems.github.com
12
+ 3. > sudo gem install fizx-csvget
13
+
14
+ == Example Usage
15
+
16
+ > cat hn.let
17
+ {
18
+ "headlines":[{
19
+ "title": ".title a",
20
+ "link": ".title a @href",
21
+ "comments": "match(.subtext a:nth-child(3), '\\d+')",
22
+ "user": ".subtext a:nth-child(2)",
23
+ "score": "match(.subtext span, '\\d+')",
24
+ "time": "match(.subtext, '\\d+\\s+\\w+\\s+ago')"
25
+ }]
26
+ }
27
+ > csvget --directory-prefix=./data -A "/x" -w 5 --parselet=hn.let http://news.ycombinator.com/
28
+ > head data/headlines.csv
29
+ comments,title,time,link,score,user
30
+ 4,Simpson's paradox: why mistrust seemingly simple statistics,2 hours ago,http://en.wikipedia.org/wiki/Simpson%27s_paradox,41,waldrews
31
+ 67,America's unjust sex laws,2 hours ago,http://www.economist.com/opinion/displaystory.cfm?story_id=14165460,59,MikeCapone
32
+ 23,Buy somebody lunch,3 hours ago,http://www.whattofix.com/blog/archives/2009/08/buy-somebody-lu.php,58,DanielBMarkham
33
+ 10,A design pattern is an artifact of a missing feature in your chosen language,3 hours ago,http://www.snell-pym.org.uk/archives/2008/12/29/design-patterns/,31,bensummers
34
+ 4,API changes in Snow Leopard,1 hour ago,http://developer.apple.com/mac/library/releasenotes/MacOSX/WhatsNewInOSX/Articles/MacOSX10_6.html#//apple_ref/doc/uid/TP40008898-SW1,14,pieter
35
+ 16,How to run a linux based home web server,3 hours ago,http://stevehanov.ca/blog/index.php?id=73,28,RiderOfGiraffes
36
+ 1,"OpenCL ""Hello World""",1 hour ago,"http://developer.apple.com/mac/library/documentation/Performance/Conceptual/OpenCL_MacProgGuide/Example:Hello,World/Example:Hello,World.html",8,pieter
37
+ 15,US Senate bill allows White House to disconnect private computers from Internet,4 hours ago,http://news.cnet.com/8301-13578_3-10320096-38.html,35,drewr
38
+ 1,Strategy: Solve Only 80 Percent of the Problem,47 minutes ago,http://highscalability.com/strategy-solve-only-80-percent-problem,6,alrex021
39
+
40
+
41
+ == Copyright
42
+
43
+ Copyright (c) 2009 Kyle Maxwell. See LICENSE for details (MIT).
data/Rakefile ADDED
@@ -0,0 +1,60 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+
4
# Gem packaging tasks. Jeweler is optional: when it cannot be loaded we only
# print a hint, so the remaining tasks in this file stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "csvget"
    gem.summary = %Q{Uses parselets and rwget to generate csv files from websites}
    gem.description = %Q{Super easy to use (but lots of dependencies :/) parser}
    gem.email = "kyle@kylemaxwell.com"
    gem.homepage = "http://github.com/fizx/csvget"
    gem.authors = ["Kyle Maxwell"]
    gem.add_dependency("fizx-rwget", ["> 0.2.3"])
    gem.add_dependency("fizx-parsley-ruby", ["> 0.0.0"])
    gem.add_dependency("activesupport", ["> 0.0.0"])
    gem.add_dependency("fastercsv", [">= 1.4.0"])

    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests: every test/**/*_test.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# Coverage task. When rcov is not installed, a placeholder :rcov task is
# defined that aborts with installation instructions.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# This Rakefile does not define :check_dependencies itself, so it is only
# added as a prerequisite when some loaded task library has defined it.
# Otherwise `rake test` would abort on an unknown task.
# NOTE(review): presumably the task comes from Jeweler::Tasks -- confirm.
task :test => :check_dependencies if Rake::Task.task_defined?("check_dependencies")

task :default => :test

# API documentation, titled with the gem version when a VERSION file exists.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # strip: keep the VERSION file's trailing newline out of the title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "csvget #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/bin/csvget ADDED
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env ruby
2
+ require "rubygems"
3
+ require File.dirname(__FILE__) + "/../lib/csvget"
4
+ require "rwget"
5
+
6
# Extend rwget's option parser with a repeatable --parselet switch; every
# occurrence appends one path to options[:parselets].
parser = RWGetOptionParser.new do |opts|
  opts.on("--parselet=JSON_FILE", "JSON_FILE is a parselet.") do |parselet_path|
    parser.options[:parselets] ||= []
    parser.options[:parselets].push(parselet_path)
  end
end
parser.parse!

# Without any seed there is nothing to fetch: show the usage text and exit
# with a failure status.
if parser.options[:seeds].empty?
  puts parser.usage
  puts " -h for options listing"
  exit(1)
end

# Use CSVStore as the store unless the options already name another one.
parser.options[:store_class] ||= "CSVStore"

controller = RWGet::Controller.new(parser.options)
begin
  controller.start
ensure
  # Runs whether start returns normally or raises, so the controller is
  # always closed.
  STDERR.puts "Closing..."
  controller.close
end
data/lib/csvget.rb ADDED
@@ -0,0 +1,48 @@
1
+ require "rubygems"
2
+ require "rwget"
3
+ require "parsley"
4
+ require "fastercsv"
5
+ require "activesupport"
6
+ require "fileutils"
7
+
8
# Store that turns parselet output into CSV files.
#
# Every top-level key of a parselet's output becomes "<key>.csv" inside the
# output folder, and every record found under that key becomes one row. The
# column order of a file is taken from the first record seen for its key.
class CSVStore
  # options[:prefix]    - output directory (default "."); created if missing.
  # options[:parselets] - paths of parselet files; each one is read and
  #                       compiled with Parsley.new.
  def initialize(options = {})
    @output_folder = options[:prefix] || "."
    FileUtils.mkdir_p(@output_folder)
    @parselets = (options[:parselets] || []).map { |path| Parsley.new(File.read(path)) }
    @files = {}
    @headers = {}
  end

  # Parses the document at tmpfile.path with every configured parselet and
  # appends the extracted records to the CSV files. A ParsleyError is reported
  # on STDERR as a warning and the remaining parselets are still run.
  #
  # host    - not used by this store.
  # tmpfile - any object responding to #path.
  def put(host, tmpfile)
    @parselets.each do |parselet|
      begin
        walk(parselet.parse(:file => tmpfile.path))
      rescue ParsleyError => e
        STDERR.puts "warning: #{e.message}"
      end
    end
  end

  # Appends the records in data to their CSV files.
  #
  # data   - Hash mapping a file base name to one record (a Hash) or an Array
  #          of records.
  # prefix - unused; kept so existing callers passing it keep working.
  #
  # Keys without any record (empty arrays) are skipped, since there is nothing
  # to write and no record to derive the headers from. Values that are not
  # Hashes cannot be mapped to columns; they are skipped with a warning
  # instead of raising NoMethodError.
  def walk(data, prefix = nil)
    data.each do |name, values|
      # Not Array(values): that would explode a single Hash record into
      # [key, value] pairs.
      rows = values.is_a?(Array) ? values : [values]
      records = rows.select { |row| row.is_a?(Hash) }
      skipped = rows.size - records.size
      STDERR.puts "warning: skipping #{skipped} non-hash value(s) for #{name}" if skipped > 0
      next if records.empty?

      headers = @headers[name] ||= records.first.keys
      csv = @files[name] ||= open_csv(name, headers)
      records.each do |record|
        csv << headers.map { |key| record[key] }
      end
    end
  end

  # Closes every CSV file opened so far. The handle cache is emptied, so
  # calling close again is a no-op and a later walk reopens files on demand.
  def close
    @files.each_value { |csv| csv.close }
    @files.clear
  end

  private

  # Opens "<name>.csv" in the output folder for appending. The header row is
  # requested only when the file is missing or empty, so appending to the
  # output of an earlier run does not insert a second header row mid-file.
  def open_csv(name, headers)
    path = File.join(@output_folder, "#{name}.csv")
    FasterCSV.open(path, "a", :headers => headers, :write_headers => File.size?(path).nil?)
  end
end
@@ -0,0 +1,20 @@
1
+ require "test/unit"
2
+ require File.dirname(__FILE__) + "/../lib/csvget"
3
+ require "fileutils"
4
+
5
# Exercises CSVStore#walk end to end: records go in, a CSV file comes out.
class CsvgetTest < Test::Unit::TestCase
  include FileUtils

  # Each test writes into a scratch directory next to this file; it is
  # removed again in teardown.
  def setup
    @dir = File.join(File.dirname(__FILE__), "csvget_test_output")
    rm_rf @dir
    @store = CSVStore.new(:prefix => @dir)
  end

  def teardown
    rm_rf @dir
  end

  # Single-column records keep the expected output independent of hash
  # ordering.
  def test_walk_writes_header_and_rows
    @store.walk("headlines" => [{"title" => "first"}, {"title" => "second"}])
    @store.close # flush to disk before reading the file back

    assert_equal "title\nfirst\nsecond\n", File.read(File.join(@dir, "headlines.csv"))
  end
end
20
+
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fizx-csvget
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Kyle Maxwell
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-28 00:00:00 -07:00
13
+ default_executable: csvget
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: fizx-rwget
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.2.3
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: fizx-parsley-ruby
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.0.0
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: activesupport
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">"
42
+ - !ruby/object:Gem::Version
43
+ version: 0.0.0
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: fastercsv
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 1.4.0
54
+ version:
55
+ description: Super easy to use (but lots of dependencies :/) parser
56
+ email: kyle@kylemaxwell.com
57
+ executables:
58
+ - csvget
59
+ extensions: []
60
+
61
+ extra_rdoc_files:
62
+ - LICENSE
63
+ - README.rdoc
64
+ files:
65
+ - .gitignore
66
+ - LICENSE
67
+ - README.rdoc
68
+ - Rakefile
69
+ - VERSION
70
+ - bin/csvget
71
+ - lib/csvget.rb
72
+ - test/csvget_test.rb
73
+ has_rdoc: false
74
+ homepage: http://github.com/fizx/csvget
75
+ licenses:
76
+ post_install_message:
77
+ rdoc_options:
78
+ - --charset=UTF-8
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: "0"
92
+ version:
93
+ requirements: []
94
+
95
+ rubyforge_project:
96
+ rubygems_version: 1.3.5
97
+ signing_key:
98
+ specification_version: 3
99
+ summary: Uses parselets and rwget to generate csv files from websites
100
+ test_files:
101
+ - test/csvget_test.rb