fizx-csvget 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,2 @@
1
+ *.gem
2
+ *.csv
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 Kyle Maxwell
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,43 @@
1
+ = csvget
2
+
3
+ == Dependencies
4
+
5
+ - http://github.com/fizx/parsley/tree/master and its dependencies.
6
+ - Rubygems
7
+
8
+ == Installation
9
+
10
+ 1. Install the dependencies.
11
+ 2. > gem sources -a http://gems.github.com
12
+ 3. > sudo gem install fizx-csvget
13
+
14
+ == Example Usage
15
+
16
+ > cat hn.let
17
+ {
18
+ "headlines":[{
19
+ "title": ".title a",
20
+ "link": ".title a @href",
21
+ "comments": "match(.subtext a:nth-child(3), '\\d+')",
22
+ "user": ".subtext a:nth-child(2)",
23
+ "score": "match(.subtext span, '\\d+')",
24
+ "time": "match(.subtext, '\\d+\\s+\\w+\\s+ago')"
25
+ }]
26
+ }
27
+ > csvget --directory-prefix=./data -A "/x" -w 5 --parselet=hn.let http://news.ycombinator.com/
28
+ > head data/headlines.csv
29
+ comments,title,time,link,score,user
30
+ 4,Simpson's paradox: why mistrust seemingly simple statistics,2 hours ago,http://en.wikipedia.org/wiki/Simpson%27s_paradox,41,waldrews
31
+ 67,America's unjust sex laws,2 hours ago,http://www.economist.com/opinion/displaystory.cfm?story_id=14165460,59,MikeCapone
32
+ 23,Buy somebody lunch,3 hours ago,http://www.whattofix.com/blog/archives/2009/08/buy-somebody-lu.php,58,DanielBMarkham
33
+ 10,A design pattern is an artifact of a missing feature in your chosen language,3 hours ago,http://www.snell-pym.org.uk/archives/2008/12/29/design-patterns/,31,bensummers
34
+ 4,API changes in Snow Leopard,1 hour ago,http://developer.apple.com/mac/library/releasenotes/MacOSX/WhatsNewInOSX/Articles/MacOSX10_6.html#//apple_ref/doc/uid/TP40008898-SW1,14,pieter
35
+ 16,How to run a linux based home web server,3 hours ago,http://stevehanov.ca/blog/index.php?id=73,28,RiderOfGiraffes
36
+ 1,"OpenCL ""Hello World""",1 hour ago,"http://developer.apple.com/mac/library/documentation/Performance/Conceptual/OpenCL_MacProgGuide/Example:Hello,World/Example:Hello,World.html",8,pieter
37
+ 15,US Senate bill allows White House to disconnect private computers from Internet,4 hours ago,http://news.cnet.com/8301-13578_3-10320096-38.html,35,drewr
38
+ 1,Strategy: Solve Only 80 Percent of the Problem,47 minutes ago,http://highscalability.com/strategy-solve-only-80-percent-problem,6,alrex021
39
+
40
+
41
+ == Copyright
42
+
43
+ Copyright (c) 2009 Kyle Maxwell. See LICENSE for details (MIT).
data/Rakefile ADDED
@@ -0,0 +1,60 @@
1
# Build, test, coverage and documentation tasks for the csvget gem.
require 'rubygems'
require 'rake'

# Packaging tasks are provided by Jeweler. If it cannot be loaded we only
# print a hint, so the remaining tasks in this file stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "csvget"
    gem.summary = %Q{Uses parselets and rwget to generate csv files from websites}
    gem.description = %Q{Super easy to use (but lots of dependencies :/) parser}
    gem.email = "kyle@kylemaxwell.com"
    gem.homepage = "http://github.com/fizx/csvget"
    gem.authors = ["Kyle Maxwell"]
    gem.add_dependency("fizx-rwget", ["> 0.2.3"])
    gem.add_dependency("fizx-parsley-ruby", ["> 0.0.0"])
    gem.add_dependency("activesupport", ["> 0.0.0"])
    gem.add_dependency("fastercsv", [">= 1.4.0"])

    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# `rake test` runs every test/**/*_test.rb with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

# `rake rcov` runs the same tests under RCov, or aborts with an install hint
# when RCov is not available.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is not defined anywhere in this file (presumably it is
# created by Jeweler::Tasks above -- confirm). Only make it a prerequisite
# when it exists; otherwise `rake test` would abort on an unknown task
# whenever Jeweler failed to load.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip the version text so a trailing newline in VERSION does not end up
  # inside the rdoc title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "csvget #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/bin/csvget ADDED
@@ -0,0 +1,28 @@
1
#!/usr/bin/env ruby
# Command-line entry point: parses rwget-style options plus a repeatable
# --parselet flag, then runs an RWGet::Controller whose store class defaults
# to CSVStore.
require "rubygems"
require File.join(File.dirname(__FILE__), "..", "lib", "csvget")
require "rwget"

# The handler refers to `parser` itself; that local is already assigned by
# the time the handler runs (presumably during parse! below).
parser = RWGetOptionParser.new do |opts|
  opts.on("--parselet=JSON_FILE", "JSON_FILE is a parselet.") do |path|
    # Collect every --parselet occurrence into one list.
    (parser.options[:parselets] ||= []) << path
  end
end
parser.parse!

# Without any seed there is nothing to fetch: show usage and exit non-zero.
if parser.options[:seeds].empty?
  puts parser.usage, "  -h for options listing"
  exit(1)
end

# Keep a store class that was already set; otherwise use CSVStore.
parser.options[:store_class] ||= "CSVStore"

controller = RWGet::Controller.new(parser.options)
begin
  controller.start
ensure
  # Always close the controller, even when the run is interrupted.
  STDERR.puts "Closing..."
  controller.close
end
data/lib/csvget.rb ADDED
@@ -0,0 +1,48 @@
1
+ require "rubygems"
2
+ require "rwget"
3
+ require "parsley"
4
+ require "fastercsv"
5
+ require "activesupport"
6
+ require "fileutils"
7
+
8
# Store that runs every configured parselet over a fetched page and appends
# the extracted records to CSV files, one file per top-level key of the
# parselet output.
class CSVStore
  # options[:prefix]    - directory that receives the CSV files (default ".");
  #                       created if it does not exist.
  # options[:parselets] - list of parselet file paths; each file is read and
  #                       handed to Parsley.new.
  def initialize(options = {})
    @output_folder = options[:prefix] || "."
    FileUtils.mkdir_p(@output_folder)
    @parselets = (options[:parselets] || []).map { |path| Parsley.new(File.read(path)) }
    @files = {}    # output name => open CSV handle
    @headers = {}  # output name => column order fixed by the first record seen
  end

  # Parses the file at tmpfile.path with every parselet and appends the
  # results via #walk. +host+ is accepted but not used here.
  #
  # A ParsleyError is reported on STDERR as a warning and the remaining
  # parselets still run.
  def put(host, tmpfile)
    @parselets.each do |parselet|
      begin
        walk(parselet.parse(:file => tmpfile.path))
      rescue ParsleyError => e
        STDERR.puts "warning: #{e.message}"
      end
    end
  end

  # Appends parselet output to "<name>.csv" in the output folder, one file per
  # key of +data+. A value may be a single Hash or an Array of Hashes; each
  # Hash becomes one row.
  #
  # Values with no Hash records (nil, [], plain strings) are skipped instead
  # of raising, since they cannot be laid out as rows. The column order for a
  # name is taken from the keys of the first record ever seen for it.
  #
  # +prefix+ is kept for backward compatibility and is not used.
  def walk(data, prefix = nil)
    data.each do |name, values|
      # Do not use Array(values) here: it would explode a single Hash into
      # key/value pairs.
      records = (values.is_a?(Array) ? values : [values]).grep(Hash)
      next if records.empty?

      columns = (@headers[name] ||= records.first.keys)
      csv = (@files[name] ||= open_csv(name, columns))
      records.each { |record| csv << record.values_at(*columns) }
    end
  end

  # Closes every CSV file opened so far and forgets the handles, so that a
  # later #walk reopens the file instead of writing to a closed stream.
  def close
    @files.each_value { |csv| csv.close }
    @files.clear
  end

  private

  # Opens "<name>.csv" for appending. The header row is requested only when
  # the file is missing or empty, so re-running against an existing file does
  # not insert a second header line in the middle of the data.
  def open_csv(name, columns)
    file_name = File.join(@output_folder, "#{name}.csv")
    FasterCSV.open(file_name, "a", :headers => columns, :write_headers => !File.size?(file_name))
  end
end
@@ -0,0 +1,20 @@
1
+ require "test/unit"
2
+ require File.dirname(__FILE__) + "/../lib/csvget"
3
+ require "fileutils"
4
+
5
# Exercises CSVStore#walk against a scratch directory next to this file.
class CsvgetTest < Test::Unit::TestCase
  include FileUtils

  # Start every test from an empty scratch directory.
  def setup
    @dir = File.dirname(__FILE__) + "/tmp_output"
    rm_rf @dir
    @store = CSVStore.new(:prefix => @dir)
  end

  def teardown
    rm_rf @dir
  end

  # Records use a single column so the expected file does not depend on
  # hash key ordering.
  def test_walk_writes_header_and_rows
    @store.walk("headlines" => [{"title" => "first"}, {"title" => "second"}])
    @store.close # flush to disk before reading the file back
    assert_equal "title\nfirst\nsecond\n", File.read(@dir + "/headlines.csv")
  end
end
20
+
metadata ADDED
@@ -0,0 +1,101 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fizx-csvget
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Kyle Maxwell
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-28 00:00:00 -07:00
13
+ default_executable: csvget
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: fizx-rwget
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.2.3
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: fizx-parsley-ruby
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.0.0
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: activesupport
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">"
42
+ - !ruby/object:Gem::Version
43
+ version: 0.0.0
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: fastercsv
47
+ type: :runtime
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 1.4.0
54
+ version:
55
+ description: Super easy to use (but lots of dependencies :/) parser
56
+ email: kyle@kylemaxwell.com
57
+ executables:
58
+ - csvget
59
+ extensions: []
60
+
61
+ extra_rdoc_files:
62
+ - LICENSE
63
+ - README.rdoc
64
+ files:
65
+ - .gitignore
66
+ - LICENSE
67
+ - README.rdoc
68
+ - Rakefile
69
+ - VERSION
70
+ - bin/csvget
71
+ - lib/csvget.rb
72
+ - test/csvget_test.rb
73
+ has_rdoc: false
74
+ homepage: http://github.com/fizx/csvget
75
+ licenses:
76
+ post_install_message:
77
+ rdoc_options:
78
+ - --charset=UTF-8
79
+ require_paths:
80
+ - lib
81
+ required_ruby_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ required_rubygems_version: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - ">="
90
+ - !ruby/object:Gem::Version
91
+ version: "0"
92
+ version:
93
+ requirements: []
94
+
95
+ rubyforge_project:
96
+ rubygems_version: 1.3.5
97
+ signing_key:
98
+ specification_version: 3
99
+ summary: Uses parselets and rwget to generate csv files from websites
100
+ test_files:
101
+ - test/csvget_test.rb