podcsv 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +111 -0
- data/Rakefile +12 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/lib/podcsv/version.rb +3 -0
- data/lib/podcsv.rb +212 -0
- data/podcsv.gemspec +33 -0
- metadata +101 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c2063aefdf28a7cc548de6b13a26c4aac2c1383e
|
4
|
+
data.tar.gz: ce128de96cd979ade07e762f509b486fcda30948
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e86f4aa504004fe9dac8d9874264cb351c3c8ed30d1416fa75116ff55a858a13c077687eb14d99e0f1d8ee03517d79b5fb1948f10b16fd09e979ddb38b22b87b
|
7
|
+
data.tar.gz: 3684b1c99503f9a5c32bb6730dd68c07256c94d62b99dbc399beb875013a1853ba835358a27a19a4357f241b5c1cba17b1c300765c1943a0dbceba2bea71c2e0
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 YAMAMOTO, Masayuki
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
# PodCSV another lazy solution for `csv`
|
2
|
+
|
3
|
+
podcsv.gem provides fast access to big CSV files, e.g. ones with tens of thousands of
|
4
|
+
records. This introduces two classes: Parse-on-demand CSV and Array (PodCSV,
|
5
|
+
PodArray) and is around 10 times faster than the standard library 'csv'. You can
|
6
|
+
also access the elements (records) randomly.
|
7
|
+
|
8
|
+
This gem may be useful if you use only a very small part (e.g. 1%) of the records
|
9
|
+
in a big CSV file.
|
10
|
+
|
11
|
+
|
12
|
+
## 1. Benchmark
|
13
|
+
|
14
|
+
```
|
15
|
+
$ bundle exec rake bm
|
16
|
+
|
17
|
+
# of records
|
18
|
+
csv: 40000
|
19
|
+
podcsv: 40000
|
20
|
+
|
21
|
+
Benchmark
|
22
|
+
read:
|
23
|
+
Rehearsal --------------------------------------------
|
24
|
+
csv 4.780000 0.090000 4.870000 ( 5.135495)
|
25
|
+
podcsv 0.270000 0.040000 0.310000 ( 0.319921)
|
26
|
+
----------------------------------- total: 5.180000sec
|
27
|
+
|
28
|
+
user system total real
|
29
|
+
csv 4.650000 0.070000 4.720000 ( 5.041622)
|
30
|
+
podcsv 0.240000 0.030000 0.270000 ( 0.272924)
|
31
|
+
|
32
|
+
access:
|
33
|
+
Rehearsal --------------------------------------------
|
34
|
+
csv 4.620000 0.060000 4.680000 ( 4.919373)
|
35
|
+
podcsv 0.400000 0.030000 0.430000 ( 0.460540)
|
36
|
+
----------------------------------- total: 5.110000sec
|
37
|
+
|
38
|
+
user system total real
|
39
|
+
csv 4.660000 0.100000 4.760000 ( 4.921194)
|
40
|
+
podcsv 0.360000 0.020000 0.380000 ( 0.399690)
|
41
|
+
```
|
42
|
+
|
43
|
+
### 1.1. Trick
|
44
|
+
|
45
|
+
This gem defines two classes: `PodCSV` and `PodArray`.
|
46
|
+
PodCSV does not parse any strings on reading CSV file and just returns an
|
47
|
+
array (PodArray).
|
48
|
+
|
49
|
+
When you access elements (records) of that array via `[]`, `each`, etc.,
|
50
|
+
strings are parsed and changed into fields (PodArray cache mechanism).
|
51
|
+
|
52
|
+
|
53
|
+
## 2. Installation
|
54
|
+
|
55
|
+
Add this line to your application's Gemfile:
|
56
|
+
|
57
|
+
```ruby
|
58
|
+
gem 'podcsv'
|
59
|
+
```
|
60
|
+
|
61
|
+
And then execute:
|
62
|
+
|
63
|
+
$ bundle
|
64
|
+
|
65
|
+
Or install it yourself as:
|
66
|
+
|
67
|
+
$ gem install podcsv
|
68
|
+
|
69
|
+
|
70
|
+
## 3. Usage
|
71
|
+
|
72
|
+
### (1) Load CSV
|
73
|
+
|
74
|
+
Same as `CSV.read`.
|
75
|
+
|
76
|
+
```
|
77
|
+
ret = PodCSV.read( file [, opt_file] )
|
78
|
+
```
|
79
|
+
|
80
|
+
### (2) Access records
|
81
|
+
|
82
|
+
Same as Array: `[]`, `each`, `first`, `last`, etc.
|
83
|
+
|
84
|
+
```
|
85
|
+
ret = PodCSV.read( file [, opt_file] )
|
86
|
+
puts ret[-1]
|
87
|
+
```
|
88
|
+
|
89
|
+
|
90
|
+
### (3) Custom Line Parser
|
91
|
+
|
92
|
+
```
|
93
|
+
ary = PodCSV.read( file, {},
|
94
|
+
lambda{|s| s.split(/"/) } )
|
95
|
+
```
|
96
|
+
|
97
|
+
|
98
|
+
## 4. Development
|
99
|
+
|
100
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake rspec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
101
|
+
|
102
|
+
|
103
|
+
## 5. Contributing
|
104
|
+
|
105
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/mephistobooks/podcsv.
|
106
|
+
|
107
|
+
|
108
|
+
## 6. License
|
109
|
+
|
110
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
111
|
+
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "podcsv"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
data/lib/podcsv.rb
ADDED
@@ -0,0 +1,212 @@
|
|
1
|
+
#
|
2
|
+
#
|
3
|
+
#
|
4
|
+
require 'csv'
|
5
|
+
|
6
|
+
|
7
|
+
class PodArray < Array
|
8
|
+
|
9
|
+
#PARSER_DEFAULT = lambda{|s,opt| CSV.parse_line(s,opt)}
|
10
|
+
#PARSER_DEFAULT = lambda{|s| CSV.parse_line(s,@opt)}
|
11
|
+
#PARSER_DEFAULT = lambda{|s| CSV.parse_line(s,@opt.to_h)}
|
12
|
+
|
13
|
+
def initialize(args)
|
14
|
+
super
|
15
|
+
|
16
|
+
# we can also distinguish which are cached by the type of elements
|
17
|
+
# (String or not).
|
18
|
+
# But the advantage of having cache structure is that
|
19
|
+
# we can know which are the cached elements without traversing
|
20
|
+
# whole array.
|
21
|
+
#
|
22
|
+
@_cache = {}
|
23
|
+
if args.class == self.class
|
24
|
+
@_cache = args._cache
|
25
|
+
end
|
26
|
+
|
27
|
+
# opt for CSV.parse.
|
28
|
+
# default is {}, which means that dont change the default behaviour
|
29
|
+
# of CSV.parse_line.
|
30
|
+
#@opt = {}
|
31
|
+
@_lazy_parser_default = lambda{|s| CSV.parse_line(s)}
|
32
|
+
|
33
|
+
#
|
34
|
+
#
|
35
|
+
#
|
36
|
+
#@_lazy_parser = lambda{|x,o| CSV.parse(x,o).first}
|
37
|
+
@_lazy_parser = @_lazy_parser_default
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
def cached?(i)
|
42
|
+
not(@_cache[i].nil?)
|
43
|
+
end
|
44
|
+
attr_reader :_cache
|
45
|
+
attr_reader :_lazy_parser_default
|
46
|
+
attr_accessor :_lazy_parser
|
47
|
+
#attr_accessor :opt
|
48
|
+
|
49
|
+
#
|
50
|
+
def [](args)
|
51
|
+
ret = nil
|
52
|
+
|
53
|
+
# .
|
54
|
+
if args.is_a?(Integer)
|
55
|
+
tmp = super
|
56
|
+
|
57
|
+
# cache function.
|
58
|
+
if self.cached?(args)
|
59
|
+
|
60
|
+
# do nothing.
|
61
|
+
|
62
|
+
else
|
63
|
+
#tmp = super
|
64
|
+
#$stderr.puts "*tmp (class:#{tmp.class}):#{tmp}"
|
65
|
+
if tmp.class == String
|
66
|
+
|
67
|
+
#@_cache[args] = @_lazy_parser.call(tmp, @opt)
|
68
|
+
@_cache[args] = @_lazy_parser.call(tmp)
|
69
|
+
self[args] = @_cache[args]
|
70
|
+
|
71
|
+
else
|
72
|
+
@_cache[args] = tmp unless self.cached?(args)
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
#$stderr.puts "_cache[#{args}]: #{@_cache[args]}"
|
77
|
+
@_cache[args] # return the value of self[args].
|
78
|
+
#self[args]
|
79
|
+
#self[args] = @_cache[args]
|
80
|
+
else
|
81
|
+
# Range, etc.
|
82
|
+
# $stderr.puts "[INFO] access for #{args} (#{args.class})."
|
83
|
+
|
84
|
+
# copy partial array.
|
85
|
+
idxs = Array(args)
|
86
|
+
pary = idxs.map{|ii| self[ii]}
|
87
|
+
ret = PodArray.new(
|
88
|
+
# Integer
|
89
|
+
pary
|
90
|
+
)
|
91
|
+
|
92
|
+
# copy partial cache.
|
93
|
+
ini_cache = self._cache
|
94
|
+
idxs.each {|ii|
|
95
|
+
ret.instance_eval{ @_cache[ii] = ini_cache[ii] }
|
96
|
+
}
|
97
|
+
|
98
|
+
ret
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
def index(x)
|
103
|
+
#$stderr.puts "#{self.class}##{__method__} (#{x.class})"
|
104
|
+
self[x]
|
105
|
+
end
|
106
|
+
|
107
|
+
def first
|
108
|
+
#$stderr.puts "#{self.class}##{__method__} ()"
|
109
|
+
# self.index(0)
|
110
|
+
self[0]
|
111
|
+
end
|
112
|
+
|
113
|
+
def last
|
114
|
+
#$stderr.puts "#{self.class}##{__method__} ()"
|
115
|
+
self[self.size-1]
|
116
|
+
end
|
117
|
+
|
118
|
+
def reverse
|
119
|
+
#$stderr.puts "#{self.class}##{__method__} ()"
|
120
|
+
ret = PodArray.new(super)
|
121
|
+
#$stderr.puts "#{self.class}##{__method__} ret: #{ret.class}"
|
122
|
+
#$stderr.puts "#{self.class}##{__method__} ret.first: #{ret.first.class}"
|
123
|
+
|
124
|
+
ret
|
125
|
+
end
|
126
|
+
|
127
|
+
#
|
128
|
+
def each
|
129
|
+
s = self
|
130
|
+
a = Array(s.to_a) # cast to Array.
|
131
|
+
|
132
|
+
#$stderr.puts "s: #{s.class}"
|
133
|
+
#$stderr.puts "a: #{a.class}"
|
134
|
+
#pp a
|
135
|
+
|
136
|
+
# do cache.
|
137
|
+
a.each.with_index {|_,i|
|
138
|
+
# $stderr.puts "i: #{i}"
|
139
|
+
self[i] # do cache.
|
140
|
+
yield(self[i]) if block_given?
|
141
|
+
}
|
142
|
+
|
143
|
+
# return
|
144
|
+
if block_given?
|
145
|
+
self
|
146
|
+
else
|
147
|
+
self.to_enum
|
148
|
+
end
|
149
|
+
end
|
150
|
+
|
151
|
+
def map
|
152
|
+
tmp = self.each
|
153
|
+
ret = []
|
154
|
+
|
155
|
+
tmp.each{|e|
|
156
|
+
ret << yield(e) if block_given?
|
157
|
+
}
|
158
|
+
|
159
|
+
# return
|
160
|
+
if block_given?
|
161
|
+
ret
|
162
|
+
else
|
163
|
+
tmp
|
164
|
+
end
|
165
|
+
|
166
|
+
end
|
167
|
+
|
168
|
+
def select
|
169
|
+
tmp = self.each
|
170
|
+
ret = []
|
171
|
+
|
172
|
+
tmp.each{|e|
|
173
|
+
# $stderr.puts "#{e} (#{e.class})"
|
174
|
+
if block_given?
|
175
|
+
ret << e if yield(e)
|
176
|
+
end
|
177
|
+
}
|
178
|
+
|
179
|
+
# return
|
180
|
+
if block_given?
|
181
|
+
PodArray.new(ret)
|
182
|
+
else
|
183
|
+
tmp
|
184
|
+
end
|
185
|
+
|
186
|
+
end
|
187
|
+
|
188
|
+
|
189
|
+
end
|
190
|
+
|
191
|
+
|
192
|
+
#
|
193
|
+
#
|
194
|
+
#
|
195
|
+
class PodCSV
|
196
|
+
|
197
|
+
#
|
198
|
+
# ==== Return
|
199
|
+
#
|
200
|
+
def self.read( file, opt_file = {}, line_parser = nil )
|
201
|
+
#@opt_csv = opt_csv
|
202
|
+
ret = PodArray.new( File.open(file,opt_file).read.split(/\n/) )
|
203
|
+
#ret.opt = @opt_csv
|
204
|
+
ret._lazy_parser = line_parser unless line_parser.nil?
|
205
|
+
ret
|
206
|
+
end
|
207
|
+
end
|
208
|
+
|
209
|
+
require 'podcsv/version'
|
210
|
+
|
211
|
+
|
212
|
+
####
|
data/podcsv.gemspec
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'podcsv/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "podcsv"
|
8
|
+
spec.version = PodCSV::VERSION
|
9
|
+
spec.authors = ["YAMAMOTO, Masayuki"]
|
10
|
+
spec.email = ["martin.route66.blues+github@gmail.com"]
|
11
|
+
|
12
|
+
spec.summary = %q{Parse-on-demand CSV.}
|
13
|
+
spec.description = %q{This gem defines PodCSV and PodArray which are available to cache and parse data on-demand. These are useful when you need to read a big CSV file (around thousand records) but use very small part of it.}
|
14
|
+
spec.homepage = "https://github.com/mephistobooks/podcsv"
|
15
|
+
spec.license = "MIT"
|
16
|
+
|
17
|
+
# Prevent pushing this gem to RubyGems.org by setting 'allowed_push_host', or
|
18
|
+
# delete this section to allow pushing this gem to any host.
|
19
|
+
if spec.respond_to?(:metadata)
|
20
|
+
spec.metadata['allowed_push_host'] = "https://rubygems.org"
|
21
|
+
else
|
22
|
+
raise "RubyGems 2.0 or newer is required to protect against public gem pushes."
|
23
|
+
end
|
24
|
+
|
25
|
+
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
26
|
+
spec.bindir = "exe"
|
27
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
28
|
+
spec.require_paths = ["lib"]
|
29
|
+
|
30
|
+
spec.add_development_dependency "bundler", "~> 1.10"
|
31
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
32
|
+
spec.add_development_dependency "rspec"
|
33
|
+
end
|
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: podcsv
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- YAMAMOTO, Masayuki
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-06-10 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.10'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.10'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
description: This gem defines PodCSV and PodArray which are available to cache and
|
56
|
+
parse data on-demand. These are useful when you need to read a big CSV file (around
|
57
|
+
thousand records) but use very small part of it.
|
58
|
+
email:
|
59
|
+
- martin.route66.blues+github@gmail.com
|
60
|
+
executables: []
|
61
|
+
extensions: []
|
62
|
+
extra_rdoc_files: []
|
63
|
+
files:
|
64
|
+
- ".gitignore"
|
65
|
+
- ".rspec"
|
66
|
+
- ".travis.yml"
|
67
|
+
- Gemfile
|
68
|
+
- LICENSE.txt
|
69
|
+
- README.md
|
70
|
+
- Rakefile
|
71
|
+
- bin/console
|
72
|
+
- bin/setup
|
73
|
+
- lib/podcsv.rb
|
74
|
+
- lib/podcsv/version.rb
|
75
|
+
- podcsv.gemspec
|
76
|
+
homepage: https://github.com/mephistobooks/podcsv
|
77
|
+
licenses:
|
78
|
+
- MIT
|
79
|
+
metadata:
|
80
|
+
allowed_push_host: https://rubygems.org
|
81
|
+
post_install_message:
|
82
|
+
rdoc_options: []
|
83
|
+
require_paths:
|
84
|
+
- lib
|
85
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
|
+
requirements:
|
92
|
+
- - ">="
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '0'
|
95
|
+
requirements: []
|
96
|
+
rubyforge_project:
|
97
|
+
rubygems_version: 2.4.6
|
98
|
+
signing_key:
|
99
|
+
specification_version: 4
|
100
|
+
summary: Parse-on-demand CSV.
|
101
|
+
test_files: []
|