external 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History +5 -0
- data/MIT-LICENSE +21 -0
- data/README +168 -0
- data/lib/ext_arc.rb +108 -0
- data/lib/ext_arr.rb +727 -0
- data/lib/ext_ind.rb +1120 -0
- data/lib/external/base.rb +85 -0
- data/lib/external/chunkable.rb +105 -0
- data/lib/external/enumerable.rb +137 -0
- data/lib/external/io.rb +398 -0
- data/lib/external.rb +3 -0
- data/test/benchmarks/benchmarks_20070918.txt +45 -0
- data/test/benchmarks/benchmarks_20070921.txt +91 -0
- data/test/benchmarks/benchmarks_20071006.txt +147 -0
- data/test/benchmarks/test_copy_file.rb +80 -0
- data/test/benchmarks/test_pos_speed.rb +47 -0
- data/test/benchmarks/test_read_time.rb +55 -0
- data/test/cached_ext_ind_test.rb +219 -0
- data/test/check/benchmark_check.rb +441 -0
- data/test/check/namespace_conflicts_check.rb +23 -0
- data/test/check/pack_check.rb +90 -0
- data/test/ext_arc_test.rb +286 -0
- data/test/ext_arr/alt_sep.txt +3 -0
- data/test/ext_arr/cr_lf_input.txt +3 -0
- data/test/ext_arr/input.index +0 -0
- data/test/ext_arr/input.txt +1 -0
- data/test/ext_arr/inputb.index +0 -0
- data/test/ext_arr/inputb.txt +1 -0
- data/test/ext_arr/lf_input.txt +3 -0
- data/test/ext_arr/lines.txt +19 -0
- data/test/ext_arr/without_index.txt +1 -0
- data/test/ext_arr_test.rb +534 -0
- data/test/ext_ind_test.rb +1472 -0
- data/test/external/base_test.rb +74 -0
- data/test/external/chunkable_test.rb +182 -0
- data/test/external/index/input.index +0 -0
- data/test/external/index/inputb.index +0 -0
- data/test/external/io_test.rb +414 -0
- data/test/external_test_helper.rb +31 -0
- data/test/external_test_suite.rb +4 -0
- data/test/test_array.rb +1192 -0
- metadata +104 -0
data/History
ADDED
data/MIT-LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
Copyright (c) 2006-2007, Regents of the University of Colorado.
|
2
|
+
Developer:: Simon Chiang, Biomolecular Structure Program, Hansen Lab
|
3
|
+
Support:: CU Denver School of Medicine Deans Academic Enrichment Fund
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this
|
6
|
+
software and associated documentation files (the "Software"), to deal in the Software
|
7
|
+
without restriction, including without limitation the rights to use, copy, modify, merge,
|
8
|
+
publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
|
9
|
+
to whom the Software is furnished to do so, subject to the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be included in all copies or
|
12
|
+
substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
18
|
+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
19
|
+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
20
|
+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
21
|
+
OTHER DEALINGS IN THE SOFTWARE.
|
data/README
ADDED
@@ -0,0 +1,168 @@
|
|
1
|
+
= External
|
2
|
+
|
3
|
+
Indexing and array-like access to data stored on disk rather than in memory.
|
4
|
+
|
5
|
+
== Description
|
6
|
+
|
7
|
+
External provides an easy way to index files such that array-like calls can store and
|
8
|
+
retrieve entries directly from the file without loading it into memory. The indexes can
|
9
|
+
be cached for performance or stored on disk alongside the data file, in essence giving you
|
10
|
+
arbitrarily large arrays.
|
11
|
+
|
12
|
+
The main classes of external provide array-like access to the following:
|
13
|
+
* ExtInd (External Index) -- formatted binary data
|
14
|
+
* ExtArr (External Array) -- externally stored ruby objects
|
15
|
+
* ExtArc (External Archive) -- externally stored string data
|
16
|
+
|
17
|
+
ExtArc is a subclass of ExtArr specialized for string archival files, formats like FASTA
|
18
|
+
where entries are strings delimited by '>':
|
19
|
+
|
20
|
+
>Q9BXQ0|Q9BXQ0_HUMAN Tissue transglutaminase (Fragment) - Homo sapiens (Human).
|
21
|
+
LEPFSGKALCSWSIC
|
22
|
+
>P02452|CO1A1_HUMAN Collagen alpha-1(I) chain - Homo sapiens (Human).
|
23
|
+
MFSFVDLRLLLLLAATALLTHGQEEGQVEGQDEDIPPITCVQNGLRYHDRDVWKPEPCRI
|
24
|
+
CVCDNGKVLCDDVICDETKNCPGAEVPEGECCPVCPDGSESPTDQETTGVEGPKGDTGPR
|
25
|
+
GPRGPAGPPGRDGIPGQPGLPGPPGPPGPPGPPGLGGNFAPQLSYGYDEKSTGGISVPGP
|
26
|
+
...
|
27
|
+
|
28
|
+
The array-like behavior of these classes is developed against modified versions of the
|
29
|
+
Array tests themselves, and often uses the exact same tests. The idea is to eventually
|
30
|
+
duck-type all Array methods, including sort and collect, with acceptable performance.
|
31
|
+
|
32
|
+
=== Bugs/Known Issues
|
33
|
+
|
34
|
+
* only a limited set of array methods are currently supported
|
35
|
+
* reindexing of ExtArr does not work for arrays containing yaml strings
|
36
|
+
* yaml serialization/deserialization of some strings does not reproduce identical input
|
37
|
+
and so will not be faithfully stored in ExtArr. Carriage return strings are notable:
|
38
|
+
"\r", "\r\n", "string_with_\r\n_internal", as are chains of newlines: "\n", "\n\n"
|
39
|
+
* documentation is poor at the moment
|
40
|
+
|
41
|
+
--
|
42
|
+
== Performance
|
43
|
+
++
|
44
|
+
|
45
|
+
== Info
|
46
|
+
|
47
|
+
Copyright (c) 2006-2007, Regents of the University of Colorado.
|
48
|
+
Developer:: {Simon Chiang}[http://bahuvrihi.wordpress.com], {Biomolecular Structure Program}[http://biomol.uchsc.edu/], {Hansen Lab}[http://hsc-proteomics.uchsc.edu/hansenlab/]
|
49
|
+
Support:: CU Denver School of Medicine Deans Academic Enrichment Fund
|
50
|
+
Licence:: MIT-Style
|
51
|
+
|
52
|
+
== Installation
|
53
|
+
|
54
|
+
External is available from RubyForge[http://rubyforge.org/projects/external]. Use:
|
55
|
+
|
56
|
+
% gem install external
|
57
|
+
|
58
|
+
== Usage
|
59
|
+
|
60
|
+
=== ExtArr
|
61
|
+
|
62
|
+
ExtArr can be initialized from data using the [] operator and used as an array.
|
63
|
+
|
64
|
+
ea = ExtArr[1, 2.2, "cat", {:key => 'value'}]
|
65
|
+
ea[2] # => "cat"
|
66
|
+
ea.last # => {:key => 'value'}
|
67
|
+
ea << [:a, :b]
|
68
|
+
ea.to_a # => [1, 2.2, "cat", {:key => 'value'}, [:a, :b]]
|
69
|
+
|
70
|
+
Behind the scenes, ExtArr serializes and stores entries on a data source (io) and builds an
|
71
|
+
ExtInd that tracks where each entry begins and ends.
|
72
|
+
|
73
|
+
ea.io.class # => Tempfile
|
74
|
+
ea.io.rewind
|
75
|
+
ea.io.read # => "--- 1\n--- 2.2\n--- cat\n--- \n:key: value\n--- \n- :a\n- :b\n"
|
76
|
+
|
77
|
+
ea.index.class # => ExtInd
|
78
|
+
ea.index.to_a # => [[0, 6], [6, 8], [14, 8], [22, 17], [39, 15]]
|
79
|
+
|
80
|
+
By default External supports File, Tempfile, and StringIO data sources. If no data source is
|
81
|
+
given (as above), the external array is initialized to a Tempfile so that it will be cleaned
|
82
|
+
up on exit.
|
83
|
+
|
84
|
+
ExtArr can be initialized from existing data sources. In this case, ExtArr tries to find and
|
85
|
+
load an existing index; if the index doesn't exist, then you have to reindex the data manually.
|
86
|
+
|
87
|
+
File.open('path/to/file.txt', "w+") do |file|
|
88
|
+
file << "--- 1\n--- 2.2\n--- cat\n--- \n:key: value\n--- \n- :a\n- :b\n"
|
89
|
+
file.flush
|
90
|
+
|
91
|
+
index_filepath = ExtArr.default_index_filepath(file.path)
|
92
|
+
File.exists?(index_filepath) # => false
|
93
|
+
|
94
|
+
ea = ExtArr.new(file)
|
95
|
+
ea.to_a # => []
|
96
|
+
ea.reindex
|
97
|
+
ea.to_a # => [1, 2.2, "cat", {:key => 'value'}, [:a, :b]]
|
98
|
+
end
|
99
|
+
|
100
|
+
ExtArr provides an open method for easy access to file data:
|
101
|
+
|
102
|
+
ExtArr.open('path/to/file.txt') do |ea|
|
103
|
+
# ...
|
104
|
+
end
|
105
|
+
|
106
|
+
=== ExtArc
|
107
|
+
|
108
|
+
ExtArc is a subclass of ExtArr designed for string archival files. Rather than serialize and
|
109
|
+
load ruby objects to and from the data file, ExtArc simply reads and writes strings. In
|
110
|
+
addition, ExtArc provides additional reindexing methods designed to make reindexing easy.
|
111
|
+
|
112
|
+
arc = ExtArc[">swift", ">brown", ">fox"]
|
113
|
+
arc[2] # => ">fox"
|
114
|
+
arc.to_a # => [">swift", ">brown", ">fox"]
|
115
|
+
|
116
|
+
arc.io.class # => Tempfile
|
117
|
+
arc.io.rewind
|
118
|
+
arc.io.read # => ">swift>brown>fox"
|
119
|
+
|
120
|
+
File.open('path/to/file.txt', "w+") do |file|
|
121
|
+
file << ">swift>brown>fox"
|
122
|
+
file.flush
|
123
|
+
|
124
|
+
# Reindex by a separation string
|
125
|
+
arc = ExtArc.new(file)
|
126
|
+
arc.to_a # => []
|
127
|
+
arc.reindex_by_sep(:sep_string => ">", :entry_follows_sep => true)
|
128
|
+
arc.to_a # => [">swift", ">brown", ">fox"]
|
129
|
+
|
130
|
+
# Reindex by scanning an entry
|
131
|
+
arc = ExtArc.new(file)
|
132
|
+
arc.to_a # => []
|
133
|
+
arc.reindex_by_scan(/>\w*/)
|
134
|
+
arc.to_a # => [">swift", ">brown", ">fox"]
|
135
|
+
end
|
136
|
+
|
137
|
+
=== ExtInd
|
138
|
+
|
139
|
+
ExtInd provides array-like access to formatted binary data. The index of ExtArr is an
|
140
|
+
ExtInd constructed to access data formatted as 'II'; two integers corresponding to the
|
141
|
+
start position and length of entries in the ExtArr data source. For simple, repetitive
|
142
|
+
formats like 'II', processing is optimized to use a general format and frame.
|
143
|
+
|
144
|
+
ea = ExtArr.new
|
145
|
+
ea.index.class # => ExtInd
|
146
|
+
index = ea.index
|
147
|
+
|
148
|
+
index.format # => 'I*'
|
149
|
+
index.frame # => 2
|
150
|
+
index << [1,2]
|
151
|
+
index << [3,4]
|
152
|
+
index.to_a # => [[1,2],[3,4]]
|
153
|
+
|
154
|
+
ExtInd handles arbitrary packing formats, opening many possibilities:
|
155
|
+
|
156
|
+
File.open('path/to/file', "w+") do |file|
|
157
|
+
file << [1,2,3].pack("IQS")
|
158
|
+
file << [4,5,6].pack("IQS")
|
159
|
+
file << [7,8,9].pack("IQS")
|
160
|
+
file.flush
|
161
|
+
|
162
|
+
index = ExtInd.new(file, :format => "IQS")
|
163
|
+
index[1] # => [4,5,6]
|
164
|
+
index.to_a # => [[1,2,3],[4,5,6],[7,8,9]]
|
165
|
+
end
|
166
|
+
|
167
|
+
Note: at the moment formats must be specified longhand, i.e. 'III' cannot be written as 'I3',
|
168
|
+
and the native size directives for sSiIlL are not supported.
|
data/lib/ext_arc.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
require 'ext_arr'
|
2
|
+
require 'stringio'
|
3
|
+
require 'strscan'
|
4
|
+
|
5
|
+
class ExtArc < ExtArr
|
6
|
+
|
7
|
+
def str_to_entry(str)
|
8
|
+
str
|
9
|
+
end
|
10
|
+
|
11
|
+
def entry_to_str(entry)
|
12
|
+
entry.to_s
|
13
|
+
end
|
14
|
+
|
15
|
+
def reindex_by_scan(pattern=/\r?\n|$/)
|
16
|
+
index_format = index.format
|
17
|
+
reindex do |index|
|
18
|
+
io.scan do |scan_pos, string|
|
19
|
+
scanner = StringScanner.new(string)
|
20
|
+
while advanced = scanner.search_full(pattern, true, false)
|
21
|
+
break unless advanced > 0
|
22
|
+
|
23
|
+
index.unframed_write [scan_pos, advanced]
|
24
|
+
scan_pos += advanced
|
25
|
+
end
|
26
|
+
|
27
|
+
scanner.restsize
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
def reindex_by_sep(options={})
|
33
|
+
options = {
|
34
|
+
:sep_string => $/,
|
35
|
+
:entry_follows_sep => false,
|
36
|
+
:exclude_sep => false
|
37
|
+
}.merge(options)
|
38
|
+
|
39
|
+
sep_string = options[:sep_string]
|
40
|
+
entry_follows_sep = options[:entry_follows_sep]
|
41
|
+
exclude_sep = options[:exclude_sep]
|
42
|
+
sep_string_length = sep_string.length
|
43
|
+
|
44
|
+
reindex do |index|
|
45
|
+
current_pos = 0
|
46
|
+
entry_begin = 0
|
47
|
+
|
48
|
+
io_length = io.length
|
49
|
+
io.each_line(sep_string) do |line|
|
50
|
+
# Note positions MUST be built up using line.length
|
51
|
+
# io.pos cannot return positions greater than ~2.1e9
|
52
|
+
current_pos += line.length
|
53
|
+
entry_end = current_pos - (entry_follows_sep && current_pos != io_length ? 1 : 0)
|
54
|
+
|
55
|
+
unless entry_end == entry_begin
|
56
|
+
if exclude_sep
|
57
|
+
if entry_follows_sep
|
58
|
+
entry_begin += sep_string_length
|
59
|
+
else
|
60
|
+
entry_end -= sep_string_length
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
index.unframed_write [entry_begin, entry_end-entry_begin]
|
65
|
+
entry_begin = entry_end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
70
|
+
|
71
|
+
def reindex_by_line(options={}) # :nodoc:
|
72
|
+
options = {
|
73
|
+
:sep_string => $/,
|
74
|
+
:break_before => false,
|
75
|
+
:exclude_break => false
|
76
|
+
}.merge(options)
|
77
|
+
|
78
|
+
sep_string = options[:sep_string]
|
79
|
+
break_before = options[:break_before]
|
80
|
+
exclude_break = options[:exclude_break]
|
81
|
+
|
82
|
+
reindex do |index|
|
83
|
+
last_pos = 0
|
84
|
+
current_pos = 0
|
85
|
+
range_begin = 0
|
86
|
+
|
87
|
+
io.each_line(sep_string) do |line|
|
88
|
+
# Note positions MUST be built up using line.length
|
89
|
+
# io.pos cannot return positions greater than ~2.1e9
|
90
|
+
last_pos = current_pos
|
91
|
+
current_pos += line.length
|
92
|
+
|
93
|
+
if (block_given? ? yield(line) : true)
|
94
|
+
range_end = (break_before || exclude_break) ? last_pos : current_pos
|
95
|
+
unless range_end == range_begin
|
96
|
+
index.unframed_write [range_begin, range_end-range_begin]
|
97
|
+
end
|
98
|
+
range_begin = (break_before && !exclude_break) ? last_pos : current_pos
|
99
|
+
end
|
100
|
+
end
|
101
|
+
|
102
|
+
range_end = current_pos
|
103
|
+
unless range_end == range_begin
|
104
|
+
index.unframed_write [range_begin, range_end-range_begin]
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|