external 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History +5 -0
- data/MIT-LICENSE +21 -0
- data/README +168 -0
- data/lib/ext_arc.rb +108 -0
- data/lib/ext_arr.rb +727 -0
- data/lib/ext_ind.rb +1120 -0
- data/lib/external/base.rb +85 -0
- data/lib/external/chunkable.rb +105 -0
- data/lib/external/enumerable.rb +137 -0
- data/lib/external/io.rb +398 -0
- data/lib/external.rb +3 -0
- data/test/benchmarks/benchmarks_20070918.txt +45 -0
- data/test/benchmarks/benchmarks_20070921.txt +91 -0
- data/test/benchmarks/benchmarks_20071006.txt +147 -0
- data/test/benchmarks/test_copy_file.rb +80 -0
- data/test/benchmarks/test_pos_speed.rb +47 -0
- data/test/benchmarks/test_read_time.rb +55 -0
- data/test/cached_ext_ind_test.rb +219 -0
- data/test/check/benchmark_check.rb +441 -0
- data/test/check/namespace_conflicts_check.rb +23 -0
- data/test/check/pack_check.rb +90 -0
- data/test/ext_arc_test.rb +286 -0
- data/test/ext_arr/alt_sep.txt +3 -0
- data/test/ext_arr/cr_lf_input.txt +3 -0
- data/test/ext_arr/input.index +0 -0
- data/test/ext_arr/input.txt +1 -0
- data/test/ext_arr/inputb.index +0 -0
- data/test/ext_arr/inputb.txt +1 -0
- data/test/ext_arr/lf_input.txt +3 -0
- data/test/ext_arr/lines.txt +19 -0
- data/test/ext_arr/without_index.txt +1 -0
- data/test/ext_arr_test.rb +534 -0
- data/test/ext_ind_test.rb +1472 -0
- data/test/external/base_test.rb +74 -0
- data/test/external/chunkable_test.rb +182 -0
- data/test/external/index/input.index +0 -0
- data/test/external/index/inputb.index +0 -0
- data/test/external/io_test.rb +414 -0
- data/test/external_test_helper.rb +31 -0
- data/test/external_test_suite.rb +4 -0
- data/test/test_array.rb +1192 -0
- metadata +104 -0
data/History
ADDED
data/MIT-LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright (c) 2006-2007, Regents of the University of Colorado.
|
2
|
+
Developer:: Simon Chiang, Biomolecular Structure Program, Hansen Lab
|
3
|
+
Support:: CU Denver School of Medicine Deans Academic Enrichment Fund
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this
|
6
|
+
software and associated documentation files (the "Software"), to deal in the Software
|
7
|
+
without restriction, including without limitation the rights to use, copy, modify, merge,
|
8
|
+
publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
|
9
|
+
to whom the Software is furnished to do so, subject to the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be included in all copies or
|
12
|
+
substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
18
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
19
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
20
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
21
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
= External
|
2
|
+
|
3
|
+
Indexing and array-like access to data stored on disk rather than in memory.
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
External provides an easy way to index files such that array-like calls can store and
|
8
|
+
retrieve entries directly from the file without loading it into memory. The indexes can
|
9
|
+
be cached for performance or stored on disk alongside the data file, in essence giving you
|
10
|
+
arbitrarily large arrays.
|
11
|
+
|
12
|
+
The main classes of external provide array-like access to the following:
|
13
|
+
* ExtInd (External Index) -- formatted binary data
|
14
|
+
* ExtArr (External Array) -- externally stored ruby objects
|
15
|
+
* ExtArc (External Archive) -- externally stored string data
|
16
|
+
|
17
|
+
ExtArc is a subclass of ExtArr specialized for string archival files, formats like FASTA
|
18
|
+
where entries are strings delimited by '>':
|
19
|
+
|
20
|
+
>Q9BXQ0|Q9BXQ0_HUMAN Tissue transglutaminase (Fragment) - Homo sapiens (Human).
|
21
|
+
LEPFSGKALCSWSIC
|
22
|
+
>P02452|CO1A1_HUMAN Collagen alpha-1(I) chain - Homo sapiens (Human).
|
23
|
+
MFSFVDLRLLLLLAATALLTHGQEEGQVEGQDEDIPPITCVQNGLRYHDRDVWKPEPCRI
|
24
|
+
CVCDNGKVLCDDVICDETKNCPGAEVPEGECCPVCPDGSESPTDQETTGVEGPKGDTGPR
|
25
|
+
GPRGPAGPPGRDGIPGQPGLPGPPGPPGPPGPPGLGGNFAPQLSYGYDEKSTGGISVPGP
|
26
|
+
...
|
27
|
+
|
28
|
+
The array-like behavior of these classes is developed against modified versions of the
|
29
|
+
Array tests themselves, and often uses the exact same tests. The idea is to eventually
|
30
|
+
duck-type all Array methods, including sort and collect, with acceptable performance.
|
31
|
+
|
32
|
+
=== Bugs/Known Issues
|
33
|
+
|
34
|
+
* only a limited set of array methods are currently supported
|
35
|
+
* reindexing of ExtArr does not work for arrays containing yaml strings
|
36
|
+
* yaml serialization/deserialization of some strings does not reproduce identical input
|
37
|
+
and so they will not be faithfully stored in ExtArr. Carriage return strings are notable:
|
38
|
+
"\r", "\r\n", "string_with_\r\n_internal", as are chains of newlines: "\n", "\n\n"
|
39
|
+
* documentation is poor at the moment
|
40
|
+
|
41
|
+
--
|
42
|
+
== Performance
|
43
|
+
++
|
44
|
+
|
45
|
+
== Info
|
46
|
+
|
47
|
+
Copyright (c) 2006-2007, Regents of the University of Colorado.
|
48
|
+
Developer:: {Simon Chiang}[http://bahuvrihi.wordpress.com], {Biomolecular Structure Program}[http://biomol.uchsc.edu/], {Hansen Lab}[http://hsc-proteomics.uchsc.edu/hansenlab/]
|
49
|
+
Support:: CU Denver School of Medicine Deans Academic Enrichment Fund
|
50
|
+
Licence:: MIT-Style
|
51
|
+
|
52
|
+
== Installation
|
53
|
+
|
54
|
+
External is available from RubyForge[http://rubyforge.org/projects/external]. Use:
|
55
|
+
|
56
|
+
% gem install external
|
57
|
+
|
58
|
+
== Usage
|
59
|
+
|
60
|
+
=== ExtArr
|
61
|
+
|
62
|
+
ExtArr can be initialized from data using the [] operator and used as an array.
|
63
|
+
|
64
|
+
ea = ExtArr[1, 2.2, "cat", {:key => 'value'}]
|
65
|
+
ea[2] # => "cat"
|
66
|
+
ea.last # => {:key => 'value'}
|
67
|
+
ea << [:a, :b]
|
68
|
+
ea.to_a # => [1, 2.2, "cat", {:key => 'value'}, [:a, :b]]
|
69
|
+
|
70
|
+
Behind the scenes, ExtArr serializes and stores entries on a data source (io) and builds an
|
71
|
+
ExtInd that tracks where each entry begins and ends.
|
72
|
+
|
73
|
+
ea.io.class # => Tempfile
|
74
|
+
ea.io.rewind
|
75
|
+
ea.io.read # => "--- 1\n--- 2.2\n--- cat\n--- \n:key: value\n--- \n- :a\n- :b\n"
|
76
|
+
|
77
|
+
ea.index.class # => ExtInd
|
78
|
+
ea.index.to_a # => [[0, 6], [6, 8], [14, 8], [22, 17], [39, 15]]
|
79
|
+
|
80
|
+
By default External supports File, Tempfile, and StringIO data sources. If no data source is
|
81
|
+
given (as above), the external array is initialized to a Tempfile so that it will be cleaned
|
82
|
+
up on exit.
|
83
|
+
|
84
|
+
ExtArr can be initialized from existing data sources. In this case, ExtArr tries to find and
|
85
|
+
load an existing index; if the index doesn't exist, then you have to reindex the data manually.
|
86
|
+
|
87
|
+
File.open('path/to/file.txt', "w+") do |file|
|
88
|
+
file << "--- 1\n--- 2.2\n--- cat\n--- \n:key: value\n--- \n- :a\n- :b\n"
|
89
|
+
file.flush
|
90
|
+
|
91
|
+
index_filepath = ExtArr.default_index_filepath(file.path)
|
92
|
+
File.exists?(index_filepath) # => false
|
93
|
+
|
94
|
+
ea = ExtArr.new(file)
|
95
|
+
ea.to_a # => []
|
96
|
+
ea.reindex
|
97
|
+
ea.to_a # => [1, 2.2, "cat", {:key => 'value'}, [:a, :b]]
|
98
|
+
end
|
99
|
+
|
100
|
+
ExtArr provides an open method for easy access to file data:
|
101
|
+
|
102
|
+
ExtArr.open('path/to/file.txt') do |ea|
|
103
|
+
# ...
|
104
|
+
end
|
105
|
+
|
106
|
+
=== ExtArc
|
107
|
+
|
108
|
+
ExtArc is a subclass of ExtArr designed for string archival files. Rather than serialize and
|
109
|
+
load ruby objects to and from the data file, ExtArc simply reads and writes strings. In
|
110
|
+
addition, ExtArc provides additional reindexing methods designed to make reindexing easy.
|
111
|
+
|
112
|
+
arc = ExtArc[">swift", ">brown", ">fox"]
|
113
|
+
arc[2] # => ">fox"
|
114
|
+
arc.to_a # => [">swift", ">brown", ">fox"]
|
115
|
+
|
116
|
+
arc.io.class # => Tempfile
|
117
|
+
arc.io.rewind
|
118
|
+
arc.io.read # => ">swift>brown>fox"
|
119
|
+
|
120
|
+
File.open('path/to/file.txt', "w+") do |file|
|
121
|
+
file << ">swift>brown>fox"
|
122
|
+
file.flush
|
123
|
+
|
124
|
+
# Reindex by a separation string
|
125
|
+
arc = ExtArc.new(file)
|
126
|
+
arc.to_a # => []
|
127
|
+
arc.reindex_by_sep(:sep_string => ">", :entry_follows_sep => true)
|
128
|
+
arc.to_a # => [">swift", ">brown", ">fox"]
|
129
|
+
|
130
|
+
# Reindex by scanning an entry
|
131
|
+
arc = ExtArc.new(file)
|
132
|
+
arc.to_a # => []
|
133
|
+
arc.reindex_by_scan(/>\w*/)
|
134
|
+
arc.to_a # => [">swift", ">brown", ">fox"]
|
135
|
+
end
|
136
|
+
|
137
|
+
=== ExtInd
|
138
|
+
|
139
|
+
ExtInd provides array-like access to formatted binary data. The index of ExtArr is an
|
140
|
+
ExtInd constructed to access data formatted as 'II'; two integers corresponding to the
|
141
|
+
start position and length of entries in the ExtArr data source. For simple, repetitive
|
142
|
+
formats like 'II', processing is optimized to use a general format and frame.
|
143
|
+
|
144
|
+
ea = ExtArr.new
|
145
|
+
ea.index.class # => ExtInd
|
146
|
+
index = ea.index
|
147
|
+
|
148
|
+
index.format # => 'I*'
|
149
|
+
index.frame # => 2
|
150
|
+
index << [1,2]
|
151
|
+
index << [3,4]
|
152
|
+
index.to_a # => [[1,2],[3,4]]
|
153
|
+
|
154
|
+
ExtInd handles arbitrary packing formats, opening many possibilities:
|
155
|
+
|
156
|
+
File.open('path/to/file', "w+") do |file|
|
157
|
+
file << [1,2,3].pack("IQS")
|
158
|
+
file << [4,5,6].pack("IQS")
|
159
|
+
file << [7,8,9].pack("IQS")
|
160
|
+
file.flush
|
161
|
+
|
162
|
+
index = ExtInd.new(file, :format => "IQS")
|
163
|
+
index[1] # => [4,5,6]
|
164
|
+
index.to_a # => [[1,2,3],[4,5,6],[7,8,9]]
|
165
|
+
end
|
166
|
+
|
167
|
+
Note: at the moment formats must be specified longhand, ie 'III' cannot be written as 'I3',
|
168
|
+
and the native size directives for sSiIlL are not supported.
|
data/lib/ext_arc.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'ext_arr'
|
2
|
+
require 'stringio'
|
3
|
+
require 'strscan'
|
4
|
+
|
5
|
+
# ExtArc (External Archive) is an ExtArr specialized for string archival
# files: entries are read from and written to the data source as plain
# strings rather than serialized ruby objects. It also adds reindexing
# helpers that rebuild the [position, length] index of an existing data
# source by scanning for a pattern or splitting on a separator string.
class ExtArc < ExtArr

  # Converts a string read from the data source into an entry. Entries
  # are plain strings, so the input is returned unchanged.
  def str_to_entry(str)
    str
  end

  # Converts an entry into the string that is stored, using to_s.
  def entry_to_str(entry)
    entry.to_s
  end

  # Rebuilds the index by repeatedly searching the data for pattern. Each
  # successful search records one entry that runs from the current
  # position through the end of the match.
  #
  # pattern:: Regexp marking the end of each entry (default: end of line)
  def reindex_by_scan(pattern=/\r?\n|$/)
    reindex do |index|
      io.scan do |scan_pos, string|
        scanner = StringScanner.new(string)

        # search_full(pattern, true, false) advances the scan pointer past
        # the match and returns the distance advanced, or nil on no match.
        while advanced = scanner.search_full(pattern, true, false)
          # A zero-length advance would never make progress; stop scanning.
          break unless advanced > 0

          index.unframed_write [scan_pos, advanced]
          scan_pos += advanced
        end

        # Hand back the size of the unscanned remainder of this chunk.
        # NOTE(review): presumably io.scan uses this to carry the leftover
        # into the next chunk -- confirm against io.scan.
        scanner.rest_size
      end
    end
  end

  # Rebuilds the index by splitting the data on a separator string.
  #
  # Options:
  # :sep_string::         the separator (default: $/)
  # :entry_follows_sep::  when true the separator begins an entry
  #                       (ex: '>' in FASTA) rather than ending one
  # :exclude_sep::        when true the separator is left out of the
  #                       indexed entries
  def reindex_by_sep(options={})
    options = {
      :sep_string => $/,
      :entry_follows_sep => false,
      :exclude_sep => false
    }.merge(options)

    sep_string = options[:sep_string]
    entry_follows_sep = options[:entry_follows_sep]
    exclude_sep = options[:exclude_sep]
    sep_string_length = sep_string.length

    reindex do |index|
      current_pos = 0
      entry_begin = 0

      io_length = io.length
      io.each_line(sep_string) do |line|
        # Note positions MUST be built up using line.length
        # io.pos cannot return positions greater than ~2.1e9
        current_pos += line.length

        # When entries follow the separator, the separator terminating this
        # line belongs to the NEXT entry. Back up over the whole separator
        # (not a single character, which breaks multi-character separators)
        # unless this line reaches the end of the data.
        trailing = (entry_follows_sep && current_pos != io_length) ? sep_string_length : 0
        entry_end = current_pos - trailing

        # Skip empty spans, ex: the leading separator when entries follow it.
        unless entry_end == entry_begin
          if exclude_sep
            if entry_follows_sep
              entry_begin += sep_string_length
            else
              entry_end -= sep_string_length
            end
          end

          index.unframed_write [entry_begin, entry_end-entry_begin]
          entry_begin = entry_end
        end
      end
    end
  end

  # Rebuilds the index line by line, ending an entry at each line for which
  # the block returns true (every line, when no block is given).
  #
  # Options:
  # :sep_string::     the line separator (default: $/)
  # :break_before::   when true the entry ends before the breaking line,
  #                   and that line begins the next entry
  # :exclude_break::  when true the breaking line is left out of both the
  #                   entry it ends and the entry that follows
  def reindex_by_line(options={}) # :nodoc:
    options = {
      :sep_string => $/,
      :break_before => false,
      :exclude_break => false
    }.merge(options)

    sep_string = options[:sep_string]
    break_before = options[:break_before]
    exclude_break = options[:exclude_break]

    reindex do |index|
      last_pos = 0
      current_pos = 0
      range_begin = 0

      io.each_line(sep_string) do |line|
        # Note positions MUST be built up using line.length
        # io.pos cannot return positions greater than ~2.1e9
        last_pos = current_pos
        current_pos += line.length

        if (block_given? ? yield(line) : true)
          # last_pos is the start of the breaking line, current_pos its end.
          range_end = (break_before || exclude_break) ? last_pos : current_pos
          unless range_end == range_begin
            index.unframed_write [range_begin, range_end-range_begin]
          end
          range_begin = (break_before && !exclude_break) ? last_pos : current_pos
        end
      end

      # Flush whatever remains after the final break as the last entry.
      range_end = current_pos
      unless range_end == range_begin
        index.unframed_write [range_begin, range_end-range_begin]
      end
    end
  end
end
|