bahuvrihi-external 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History +17 -0
- data/MIT-LICENSE +19 -0
- data/README +203 -0
- data/lib/external.rb +2 -0
- data/lib/external/base.rb +217 -0
- data/lib/external/chunkable.rb +131 -0
- data/lib/external/enumerable.rb +182 -0
- data/lib/external/io.rb +163 -0
- data/lib/external/patches/ruby_1_8_io.rb +31 -0
- data/lib/external/patches/windows_io.rb +53 -0
- data/lib/external/patches/windows_utils.rb +27 -0
- data/lib/external/utils.rb +156 -0
- data/lib/external_archive.rb +846 -0
- data/lib/external_array.rb +57 -0
- data/lib/external_index.rb +1053 -0
- metadata +88 -0
data/History
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
== 0.3.1 / 2009-06-29
|
2
|
+
|
3
|
+
* updates to gem dependencies
|
4
|
+
* experimentally added IO as an accepted type in Utils
|
5
|
+
|
6
|
+
== 0.3.0 / 2008-10-27
|
7
|
+
|
8
|
+
Major update with refactoring (e.g. ExtArr is now ExternalArray)
|
9
|
+
and greatly expanded testing. The [] and []= methods of all Externals
|
10
|
+
now comply with the Array specification in RubySpec[rubyspec.org].
|
11
|
+
Implementation of other methods is under way.
|
12
|
+
|
13
|
+
== 0.1.0 / 2007-12-10 revision 23
|
14
|
+
|
15
|
+
Initial release with working [] and []= methods
|
16
|
+
and several basic array functions for ExtInd,
|
17
|
+
ExtArr, and ExtArc.
|
data/MIT-LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (c) 2006-2008, Regents of the University of Colorado.
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this
|
4
|
+
software and associated documentation files (the "Software"), to deal in the Software
|
5
|
+
without restriction, including without limitation the rights to use, copy, modify, merge,
|
6
|
+
publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
|
7
|
+
to whom the Software is furnished to do so, subject to the following conditions:
|
8
|
+
|
9
|
+
The above copyright notice and this permission notice shall be included in all copies or
|
10
|
+
substantial portions of the Software.
|
11
|
+
|
12
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
13
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
14
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
15
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
16
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
17
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
18
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
19
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,203 @@
|
|
1
|
+
= External
|
2
|
+
|
3
|
+
Indexing and array-like access to data stored on disk rather than in memory.
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
External provides a way to index and access array data directly from a file
|
8
|
+
without loading it into memory. Indexes may be cached in memory or stored
|
9
|
+
on disk with the data file, in essence giving you arbitrarily large arrays.
|
10
|
+
Externals automatically chunk and buffer methods like <tt>each</tt> so that
|
11
|
+
the memory footprint remains low even during enumeration.
|
12
|
+
|
13
|
+
The main External classes are:
|
14
|
+
|
15
|
+
* ExternalIndex -- for formatted binary data
|
16
|
+
* ExternalArchive -- for string data
|
17
|
+
* ExternalArray -- for objects (serialized as YAML)
|
18
|
+
|
19
|
+
The array-like behavior of these classes is developed using modified versions
|
20
|
+
of the RubySpec[http://rubyspec.org] specification for Array. The idea is to
|
21
|
+
eventually duck-type all Array methods, including sort and collect, with
|
22
|
+
acceptable performance.
|
23
|
+
|
24
|
+
* Rubyforge[http://rubyforge.org/projects/external]
|
25
|
+
* Lighthouse[http://bahuvrihi.lighthouseapp.com/projects/10590-external]
|
26
|
+
* Github[http://github.com/bahuvrihi/external/tree/master]
|
27
|
+
|
28
|
+
==== Bugs/Known Issues
|
29
|
+
|
30
|
+
* only a limited set of array methods are currently supported
|
31
|
+
* currently only [] and []= are fully tested vs RubySpec
|
32
|
+
* documentation is patchy
|
33
|
+
|
34
|
+
Note also that YAML dump/load of some objects doesn't work or doesn't
|
35
|
+
reproduce the object; such objects will not be properly stored in an
|
36
|
+
ExternalArray. Problematic objects include:
|
37
|
+
|
38
|
+
Proc and Class:
|
39
|
+
|
40
|
+
block = lambda {}
|
41
|
+
YAML.load(YAML.dump(block)) # !> TypeError: allocator undefined for Proc
|
42
|
+
YAML.dump(Object) # !> TypeError: can't dump anonymous class Class
|
43
|
+
|
44
|
+
Carriage returns ("\r"):
|
45
|
+
|
46
|
+
YAML.load(YAML.dump("\r")) # => nil
|
47
|
+
YAML.load(YAML.dump("\r\n")) # => ""
|
48
|
+
YAML.load(YAML.dump("string with \r\n inside")) # => "string with \n inside"
|
49
|
+
|
50
|
+
Chains of newlines ("\n"):
|
51
|
+
|
52
|
+
YAML.load(YAML.dump("\n")) # => ""
|
53
|
+
YAML.load(YAML.dump("\n\n")) # => ""
|
54
|
+
|
55
|
+
DateTime is loaded as Time:
|
56
|
+
|
57
|
+
YAML.load(YAML.dump(DateTime.now)).class # => Time
|
58
|
+
|
59
|
+
== Usage
|
60
|
+
|
61
|
+
=== ExternalArray
|
62
|
+
|
63
|
+
ExternalArray can be initialized from data using the [] operator and used like
|
64
|
+
an array.
|
65
|
+
|
66
|
+
a = ExternalArray['str', {'key' => 'value'}]
|
67
|
+
a[0] # => 'str'
|
68
|
+
a.last # => {'key' => 'value'}
|
69
|
+
a << [1,2]; a.to_a # => ['str', {'key' => 'value'}, [1,2]]
|
70
|
+
|
71
|
+
ExternalArray serializes and stores entries to an io while building an io_index
|
72
|
+
that tracks the start and length of each entry. By default ExternalArray
|
73
|
+
will serialize to a Tempfile and use an Array as the io_index:
|
74
|
+
|
75
|
+
a.io.class # => Tempfile
|
76
|
+
a.io.rewind; a.io.read # => "--- str\n--- \nkey: value\n--- \n- 1\n- 2\n"
|
77
|
+
a.io_index.class # => Array
|
78
|
+
a.io_index.to_a # => [[0, 8], [8, 16], [24, 13]]
|
79
|
+
|
80
|
+
To save this data more permanently, provide a path to <tt>close</tt>; the tempfile
|
81
|
+
is moved to the path and a binary index file will be created:
|
82
|
+
|
83
|
+
a.close('example.yml')
|
84
|
+
File.read('example.yml') # => "--- str\n--- \nkey: value\n--- \n- 1\n- 2\n"
|
85
|
+
|
86
|
+
index = File.read('example.index')
|
87
|
+
index.unpack('I*') # => [0, 8, 8, 16, 24, 13]
|
88
|
+
|
89
|
+
ExternalArray provides <tt>open</tt> to create ExternalArrays from an existing
|
90
|
+
file; the instance will use an index file if it exists and automatically
|
91
|
+
reindex the data if it does not. Manual calls to reindex may be necessary when
|
92
|
+
you initialize an ExternalArray with <tt>new</tt> instead of <tt>open</tt>:
|
93
|
+
|
94
|
+
# use of an existing index file
|
95
|
+
ExternalArray.open('example.yml') do |b|
|
96
|
+
File.basename(b.io_index.io.path) # => 'example.index'
|
97
|
+
b.to_a # => ['str', {'key' => 'value'}, [1,2]]
|
98
|
+
end
|
99
|
+
|
100
|
+
# automatic reindexing
|
101
|
+
FileUtils.rm('example.index')
|
102
|
+
ExternalArray.open('example.yml') do |b|
|
103
|
+
b.to_a # => ['str', {'key' => 'value'}, [1,2]]
|
104
|
+
end
|
105
|
+
|
106
|
+
# manual reindexing
|
107
|
+
file = File.open('example.yml')
|
108
|
+
c = ExternalArray.new(file)
|
109
|
+
|
110
|
+
c.to_a # => []
|
111
|
+
c.reindex
|
112
|
+
c.to_a # => ['str', {'key' => 'value'}, [1,2]]
|
113
|
+
|
114
|
+
=== ExternalArchive
|
115
|
+
|
116
|
+
ExternalArchive is exactly like ExternalArray except that it only stores
|
117
|
+
strings (ExternalArray is actually a subclass of ExternalArchive which
|
118
|
+
dumps/loads strings).
|
119
|
+
|
120
|
+
arc = ExternalArchive["swift", "brown", "fox"]
|
121
|
+
arc[2] # => "fox"
|
122
|
+
arc.to_a # => ["swift", "brown", "fox"]
|
123
|
+
arc.io.rewind; arc.io.read # => "swiftbrownfox"
|
124
|
+
|
125
|
+
ExternalArchive is useful as a base for classes to access archival data.
|
126
|
+
Here is a simple parser for FASTA[http://en.wikipedia.org/wiki/Fasta_format]
|
127
|
+
data:
|
128
|
+
|
129
|
+
# A sample FASTA entry
|
130
|
+
# >gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
|
131
|
+
# LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV
|
132
|
+
# EWIWGGFSVDKATLNRFFAFHFILPFTMVALAGVHLTFLHETGSNNPLGLTSDSDKIPFHPYYTIKDFLG
|
133
|
+
# LLILILLLLLLALLSPDMLGDPDNHMPADPLNTPLHIKPEWYFLFAYAILRSVPNKLGGVLALFLSIVIL
|
134
|
+
# GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX
|
135
|
+
# IENY
|
136
|
+
|
137
|
+
class FastaEntry
|
138
|
+
attr_reader :header, :body
|
139
|
+
|
140
|
+
def initialize(str)
|
141
|
+
@body = str.split(/\r?\n/)
|
142
|
+
@header = body.shift
|
143
|
+
end
|
144
|
+
end
|
145
|
+
|
146
|
+
class FastaArchive < ExternalArchive
|
147
|
+
def str_to_entry(str); FastaEntry.new(str); end
|
148
|
+
def entry_to_str(entry); ([entry.header] + entry.body).join("\n"); end
|
149
|
+
|
150
|
+
def reindex
|
151
|
+
reindex_by_sep('>', :entry_follows_sep => true)
|
152
|
+
end
|
153
|
+
end
|
154
|
+
|
155
|
+
require 'open-uri'
|
156
|
+
fasta = FastaArchive.new open('http://external.rubyforge.org/doc/tiny_fasta.txt')
|
157
|
+
fasta.reindex
|
158
|
+
|
159
|
+
fasta.length # => 5
|
160
|
+
fasta[0].body # => ["MEVNILAFIATTLFVLVPTAFLLIIYVKTVSQSD"]
|
161
|
+
|
162
|
+
The non-redundant {NCBI protein database}[ftp://ftp.ncbi.nih.gov/blast/db/FASTA/]
|
163
|
+
contains greater than 7 million FASTA entries in a 3.56 GB file; ExternalArchive
|
164
|
+
is targeted at files that size, where lazy loading of data and a small memory
|
165
|
+
footprint are critical.
|
166
|
+
|
167
|
+
=== ExternalIndex
|
168
|
+
|
169
|
+
ExternalIndex provides array-like access to formatted binary data. The index of an
|
170
|
+
uncached ExternalArray is an ExternalIndex configured for binary data like 'II'; two
|
171
|
+
integers corresponding to the start position and length of an entry.
|
172
|
+
|
173
|
+
index = ExternalIndex[1, 2, 3, 4, 5, 6, {:format => 'II'}]
|
174
|
+
index.format # => 'I*'
|
175
|
+
index.frame # => 2
|
176
|
+
index[1] # => [3,4]
|
177
|
+
index.to_a # => [[1,2], [3,4], [5,6]]
|
178
|
+
|
179
|
+
ExternalIndex handles arbitrary packing formats, opening many possibilities:
|
180
|
+
|
181
|
+
Tempfile.new('sample.txt') do |file|
|
182
|
+
file << [1,2,3].pack("IQS")
|
183
|
+
file << [4,5,6].pack("IQS")
|
184
|
+
file << [7,8,9].pack("IQS")
|
185
|
+
file.flush
|
186
|
+
|
187
|
+
index = ExternalIndex.new(file, :format => "IQS")
|
188
|
+
index[1] # => [4,5,6]
|
189
|
+
index.to_a # => [[1,2,3], [4,5,6], [7,8,9]]
|
190
|
+
end
|
191
|
+
|
192
|
+
== Installation
|
193
|
+
|
194
|
+
External is available from RubyForge[http://rubyforge.org/projects/external]. Use:
|
195
|
+
|
196
|
+
% gem install external
|
197
|
+
|
198
|
+
== Info
|
199
|
+
|
200
|
+
Copyright (c) 2006-2008, Regents of the University of Colorado.
|
201
|
+
Developer:: {Simon Chiang}[http://bahuvrihi.wordpress.com], {Biomolecular Structure Program}[http://biomol.uchsc.edu/], {Hansen Lab}[http://hsc-proteomics.uchsc.edu/hansenlab/]
|
202
|
+
Support:: CU Denver School of Medicine Deans Academic Enrichment Fund
|
203
|
+
Licence:: {MIT-Style}[link:files/MIT-LICENSE.html]
|
data/lib/external.rb
ADDED
@@ -0,0 +1,217 @@
|
|
1
|
+
# For some inexplicable reason yaml MUST be required before
|
2
|
+
# tempfile in order for ExtArrTest::test_LSHIFT to pass.
|
3
|
+
# Otherwise it fails with 'TypeError: allocator undefined for Proc'
|
4
|
+
|
5
|
+
require 'yaml'
|
6
|
+
require 'tempfile'
|
7
|
+
|
8
|
+
require 'external/enumerable'
|
9
|
+
require 'external/io'
|
10
|
+
|
11
|
+
module External
|
12
|
+
|
13
|
+
# Base provides shared IO and Array-like methods used by ExternalArchive,
|
14
|
+
# ExternalArray, and ExternalIndex.
|
15
|
+
class Base
|
16
|
+
class << self
|
17
|
+
|
18
|
+
# Initializes an instance of self with File.open(path, mode) as an io.
# As with File.open, when a block is given the instance is passed to
# the block and closed when the block returns (the value of the block
# is returned).  If no block is given, open returns the new instance.
#
# Nil may be provided as the path, in which case no file is opened and
# nil is passed to new as the io (Base#initialize substitutes a
# Tempfile for a nil io, so mode is ignored in that case).  Any extra
# arguments are passed through to new.
#
# If new fails for any reason, the file opened here is closed before
# the error propagates so the handle is not leaked.
def open(path=nil, mode="rb", *argv)
  io = path.nil? ? nil : File.open(path, mode)

  base = nil
  begin
    base = new(io, *argv)
  ensure
    # base is still nil only when new raised; nothing owns io at that
    # point, so it has to be closed here.
    io.close if base.nil? && io && !io.closed?
  end

  return base unless block_given?

  begin
    yield(base)
  ensure
    base.close
  end
end
|
45
|
+
end
|
46
|
+
|
47
|
+
include External::Enumerable
|
48
|
+
include External::Chunkable
|
49
|
+
|
50
|
+
# The underlying io for self.
|
51
|
+
attr_reader :io
|
52
|
+
|
53
|
+
# The default tempfile basename for Base instances
|
54
|
+
# initialized without an io.
|
55
|
+
TEMPFILE_BASENAME = "external_base"
|
56
|
+
|
57
|
+
# Creates a new instance of self with the specified io. A
|
58
|
+
# nil io causes initialization with a Tempfile; a string
|
59
|
+
# io will be converted into a StringIO.
|
60
|
+
def initialize(io=nil)
|
61
|
+
self.io = case io
|
62
|
+
when nil then Tempfile.new(TEMPFILE_BASENAME)
|
63
|
+
when String then StringIO.new(io)
|
64
|
+
else io
|
65
|
+
end
|
66
|
+
|
67
|
+
@enumerate_to_a = true
|
68
|
+
end
|
69
|
+
|
70
|
+
# Reports whether the underlying io has been closed.
def closed?
  stream = io
  stream.closed?
end
|
74
|
+
|
75
|
+
# Closes io.  If a path is specified, the content of io is saved there
# first:
#
# * a File or Tempfile is closed and the existing file is moved (not
#   copied) to path
# * any other io is flushed, rewound, and copied to path in chunks of
#   io.default_blksize bytes
#
# Raises an ArgumentError if path already exists and overwrite is not
# specified.  Returns true if io was open when close was called, and
# false otherwise.
def close(path=nil, overwrite=false)
  result = !io.closed?

  if path
    # File.exist? rather than File.exists?; the latter was deprecated
    # and then removed in Ruby 3.2.
    if File.exist?(path) && !overwrite
      raise ArgumentError, "already exists: #{path}"
    end

    case io
    when File, Tempfile
      io.close unless io.closed?
      FileUtils.move(io.path, path)
    else
      io.flush
      io.rewind
      # binary mode so the dump is a byte-for-byte copy on platforms
      # that translate newlines in text mode
      File.open(path, "wb") do |file|
        file << io.read(io.default_blksize) until io.eof?
      end
    end
  end

  io.close unless io.closed?
  result
end
|
103
|
+
|
104
|
+
# Flushes the underlying io and then calls reset_length on it so the
# io length is recalculated.  Returns self.
def flush
  stream = io
  stream.flush
  stream.reset_length
  self
end
|
110
|
+
|
111
|
+
# Returns a duplicate of self: self is flushed, then its entries are
# concatenated onto the instance returned by another.  This can be
# slow, as it may involve copying the full contents of one large file
# to another.
def dup
  flush
  copy = another
  copy.concat(self)
end
|
118
|
+
|
119
|
+
# Returns another instance of self.  Base provides no implementation
# and always raises NotImplementedError; subclasses must override.
def another
  fail NotImplementedError
end
|
124
|
+
|
125
|
+
###########################
|
126
|
+
# Array methods
|
127
|
+
###########################
|
128
|
+
|
129
|
+
# Returns true when the length of _self_ is zero.
def empty?
  count = length
  count == 0
end
|
133
|
+
|
134
|
+
# Delegates to ==; returns whatever <tt>self == another</tt> returns.
def eql?(another)
  self.==(another)
end
|
137
|
+
|
138
|
+
# Without an argument, returns the entry at index 0 (<tt>self[0]</tt>).
# With n, returns the first n entries (<tt>self[0, n]</tt>).
def first(n=nil)
  return self[0] if n.nil?

  self[0, n]
end
|
142
|
+
|
143
|
+
# Same as []: forwards both arguments (two defaults to nil) to
# <tt>self[one, two]</tt>.
def slice(one, two = nil)
  self.[](one, two)
end
|
147
|
+
|
148
|
+
# Returns self (no conversion to an Array is performed).
#--
# Warning -- errors show up when this doesn't return
# an Array... however returning an actual Array from
# to_ary may mean converting a Base into an Array for
# insertions... see/modify convert_to_ary
def to_ary
  return self
end
|
157
|
+
|
158
|
+
# Returns a string made of the class, the object_id, and the
# (possibly abbreviated) entries as formatted by ellipse_inspect.
def inspect
  summary = ellipse_inspect(self)
  "#<#{self.class}:#{object_id} #{summary}>"
end
|
162
|
+
|
163
|
+
protected
|
164
|
+
|
165
|
+
# Stores io as the underlying io for self, first extending it with
# Io unless it is already a kind of Io.
def io=(io) # :nodoc:
  if !io.kind_of?(Io)
    io.extend(Io)
  end

  @io = io
end
|
170
|
+
|
171
|
+
# Returns <tt>obj.to_int</tt> when obj responds to <tt>to_int</tt>;
# otherwise returns obj unchanged.
def convert_to_int(obj) # :nodoc:
  if obj.respond_to?(:to_int)
    obj.to_int
  else
    obj
  end
end
|
176
|
+
|
177
|
+
# Coerces obj for array-style use:
#
# * nil returns an empty array
# * an object responding to <tt>to_ary</tt> returns <tt>obj.to_ary</tt>
# * anything else is wrapped in a one-element array
def convert_to_ary(obj) # :nodoc:
  return [] if obj == nil
  return obj.to_ary if obj.respond_to?(:to_ary)

  [obj]
end
|
182
|
+
|
183
|
+
# A more array-compliant version of Chunkable#split_range.  Splits
# range into a [start, length] pair:
#
# * each endpoint is passed through convert_to_int and must then be an
#   Integer, otherwise a TypeError is raised (begin is checked first)
# * negative endpoints are offset by total (default: self.length)
# * length is <tt>finish - start</tt>, less one more when the range
#   excludes its end
def split_range(range, total=length) # :nodoc:
  # normalizes one endpoint: convert, type-check, then offset negatives
  resolve = lambda do |endpoint|
    index = convert_to_int(endpoint)
    raise TypeError, "can't convert #{endpoint.class} into Integer" unless index.kind_of?(Integer)
    index < 0 ? index + total : index
  end

  start = resolve.call(range.begin)
  finish = resolve.call(range.end)

  span = finish - start
  span -= 1 if range.exclude_end?

  [start, span]
end
|
199
|
+
|
200
|
+
# Helper for inspecting large arrays.  Arrays of up to 10 entries are
# shown in full; longer ones show the first five and last five entries
# separated by an ellipsis, followed by the total length.
def ellipse_inspect(array) # :nodoc:
  return "[#{collect_join(array.to_a)}]" unless array.length > 10

  head = collect_join(array[0,5])
  tail = collect_join(array[-5,5])
  "[#{head} ... #{tail}] (length = #{array.length})"
end
|
208
|
+
|
209
|
+
# Helper for inspecting large arrays: returns the inspect strings of
# the entries in array joined by ', '.
def collect_join(array) # :nodoc:
  pieces = array.collect(&:inspect)
  pieces.join(', ')
end
|
215
|
+
|
216
|
+
end
|
217
|
+
end
|