ioblockreader 1.0.3.20130618 → 1.0.4.20130725
Sign up to get free protection for your applications and to get access to all the features.
- data/AUTHORS +1 -0
- data/ChangeLog +6 -0
- data/README.md +1 -141
- data/ReleaseInfo +1 -1
- data/lib/ioblockreader/datablock.rb +11 -1
- data/lib/ioblockreader/ioblockreader.rb +26 -11
- metadata +2 -2
data/AUTHORS
CHANGED
data/ChangeLog
CHANGED
@@ -1,5 +1,11 @@
|
|
1
1
|
= IOBlockReader Release History
|
2
2
|
|
3
|
+
== 1.0.4.20130725 (Beta)
|
4
|
+
|
5
|
+
* cached_block was not being updated correctly when blocks were invalidated.
|
6
|
+
* index with a token of length 1 returned wrong indexes when found using cross-blocks.
|
7
|
+
* Added test for cached block
|
8
|
+
|
3
9
|
== 1.0.3.20130618 (Beta)
|
4
10
|
|
5
11
|
* Added get_block_containing_offset interface
|
data/README.md
CHANGED
@@ -3,144 +3,4 @@ IOBlockReader
|
|
3
3
|
|
4
4
|
Ruby library giving block-buffered and cached read over IO objects with a String-like interface. Ideal to parse big files as Strings, limiting memory consumption.
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
``` bash
|
9
|
-
gem install ioblockreader
|
10
|
-
```
|
11
|
-
|
12
|
-
## Usage
|
13
|
-
|
14
|
-
``` ruby
|
15
|
-
# Require the library
|
16
|
-
require 'ioblockreader'
|
17
|
-
|
18
|
-
# Open an IO
|
19
|
-
File.open('my_big_file', 'rb') do |file|
|
20
|
-
|
21
|
-
# Get an IOBlockReader on it
|
22
|
-
content = IOBlockReader.init(file)
|
23
|
-
|
24
|
-
# Access it directly
|
25
|
-
puts "Content: " + content[10..20]
|
26
|
-
|
27
|
-
# Perform a search
|
28
|
-
puts "Search 0123: " + content.index('0123')
|
29
|
-
|
30
|
-
end
|
31
|
-
```
|
32
|
-
|
33
|
-
## API
|
34
|
-
|
35
|
-
### IOBlockReader.init(io, options = {})
|
36
|
-
|
37
|
-
Get an IOBlockReader instance on an IO.
|
38
|
-
|
39
|
-
Parameters:
|
40
|
-
* **io** ( _IO_ ): The IO object used to give the String interface
|
41
|
-
* **options** (<em>map< Symbol, Object ></em>): Additional options:
|
42
|
-
* **:block_size** ( _Fixnum_ ): The block size in bytes used internally. [default = 268435456]
|
43
|
-
* **:blocks_in_memory** ( _Fixnum_ ): Maximal number of blocks in memory. If it is required to load more blocks than this value for a single operation, this value is ignored. [default = 2]
|
44
|
-
|
45
|
-
Result:
|
46
|
-
* _IOBlockReader_: The IO Block Reader ready for use
|
47
|
-
|
48
|
-
Example:
|
49
|
-
```
|
50
|
-
content = IOBlockReader.init(file, :block_size => 32768, :blocks_in_memory => 5)
|
51
|
-
```
|
52
|
-
|
53
|
-
### IOBlockReader#\[\](range)
|
54
|
-
|
55
|
-
Access a part of the data in the IO as a String.
|
56
|
-
|
57
|
-
Parameters:
|
58
|
-
* **range** ( _Fixnum_ or _Range_ ): Range to extract
|
59
|
-
|
60
|
-
Result:
|
61
|
-
* _String_: The resulting data
|
62
|
-
|
63
|
-
Example:
|
64
|
-
```
|
65
|
-
single_char = content[10]
|
66
|
-
substring = content[10..20]
|
67
|
-
```
|
68
|
-
|
69
|
-
### IOBlockReader#index(token, offset = 0, max_size_regexp = 32)
|
70
|
-
|
71
|
-
Search for a token or a list of tokens.
|
72
|
-
|
73
|
-
Parameters:
|
74
|
-
* **token** ( _String_ , _Regexp_ or <em>list< Object ></em>): Token to be found. Can be a list of tokens.
|
75
|
-
* **offset** ( _Fixnum_ ): Offset starting the search [optional = 0]
|
76
|
-
* **max_size_regexp** ( _Fixnum_ ): Maximal number of characters the match should take in case of a Regexp token. Ignored if token is a String. [optional = 32]
|
77
|
-
|
78
|
-
Result:
|
79
|
-
* _Fixnum_: Index of the token (or the first one found from the given token list), or nil if none found.
|
80
|
-
* _Fixnum_: In case token was an Array, return the index of the matching token in the array, or nil if none found.
|
81
|
-
|
82
|
-
Example:
|
83
|
-
```
|
84
|
-
# Simple string search
|
85
|
-
i = content.index('search string')
|
86
|
-
|
87
|
-
# Simple string search from a given offset
|
88
|
-
i = content.index('search string', 20)
|
89
|
-
|
90
|
-
# Regexp search: have to specify the maximal token length
|
91
|
-
i = content.index(/search \d words/, 0, 14)
|
92
|
-
|
93
|
-
# Regexp search from a given offset
|
94
|
-
i = content.index(/search \d words/, 20, 14)
|
95
|
-
|
96
|
-
# Search for multiple strings at once: will stop on the first one encountered
|
97
|
-
i, token_index = content.index( [ 'search string', 'another string' ] )
|
98
|
-
|
99
|
-
# Search for multiple tokens at once from a given offset: don't forget token length if using Regexp
|
100
|
-
i, token_index = content.index( [ 'search string', /another f.....g string/ ], 20, 22)
|
101
|
-
```
|
102
|
-
|
103
|
-
### IOBlockReader#each_block(range = 0)
|
104
|
-
|
105
|
-
Iterate over blocks in the data.
|
106
|
-
|
107
|
-
Parameters:
|
108
|
-
* **range** ( _Range_ or _Fixnum_ ): The boundaries of the iteration, or the starting index [default = 0]
|
109
|
-
* _Block_ : Code called for each block encountered
|
110
|
-
* Parameters:
|
111
|
-
* **data** ( _String_ ): The data
|
112
|
-
|
113
|
-
Example:
|
114
|
-
```
|
115
|
-
# Iterate all over the IO
|
116
|
-
content.each_block do |data|
|
117
|
-
puts "Got a block of #{data.size} bytes"
|
118
|
-
end
|
119
|
-
|
120
|
-
# Iterate on just a part
|
121
|
-
content.each_block(10..50) do |data|
|
122
|
-
puts "Got a block of #{data.size} bytes"
|
123
|
-
end
|
124
|
-
|
125
|
-
```
|
126
|
-
|
127
|
-
### IOBlockReader#get_block_containing_offset(offset = 0)
|
128
|
-
|
129
|
-
Get the block containing a given offset.
|
130
|
-
This method is mainly used to provide some low-level access for processes needing great parsing performance.
|
131
|
-
|
132
|
-
Parameters:
|
133
|
-
* **offset** ( _Fixnum_ ): The offset to be accessed [default = 0]
|
134
|
-
Return:
|
135
|
-
* _String_ : The block of data containing this offset
|
136
|
-
* _Fixnum_ : The beginning offset of this data block
|
137
|
-
* _Boolean_ : Is this block the last one?
|
138
|
-
|
139
|
-
Example:
|
140
|
-
```
|
141
|
-
str_data, begin_offset, last_one = content.get_block_containing_offset(20)
|
142
|
-
```
|
143
|
-
|
144
|
-
## Contact
|
145
|
-
|
146
|
-
Want to contribute? Have any questions? [Contact Muriel!](muriel@x-aeon.com)
|
6
|
+
[See its documentation here.](http://ioblockreader.sourceforge.net)
|
data/ReleaseInfo
CHANGED
@@ -39,9 +39,10 @@ module IOBlockReader
|
|
39
39
|
@offset = offset
|
40
40
|
@last_access_time = @@access_time_sequence
|
41
41
|
@@access_time_sequence += 1
|
42
|
-
#puts "[IOBlockReader] - Read #{size} @#{@offset}"
|
42
|
+
#puts "[IOBlockReader] - Read #{size} bytes @#{@offset} in datablock ##{self.object_id}"
|
43
43
|
@io.seek(@offset)
|
44
44
|
@io.read(size, @data)
|
45
|
+
#puts "[IOBlockReader] - Data read: #{@data.inspect}"
|
45
46
|
@last_block = @io.eof?
|
46
47
|
end
|
47
48
|
|
@@ -59,6 +60,15 @@ module IOBlockReader
|
|
59
60
|
@@access_time_sequence += 1
|
60
61
|
end
|
61
62
|
|
63
|
+
# Get a string representation of this block.
|
64
|
+
# This is mainly used for debugging purposes.
|
65
|
+
#
|
66
|
+
# Result::
|
67
|
+
# * _String_: String representation
|
68
|
+
def to_s
|
69
|
+
return "[##{self.object_id}: @#{@offset} (last access: #{@last_access_time})#{@last_block ? ' (last block)' : ''}]"
|
70
|
+
end
|
71
|
+
|
62
72
|
end
|
63
73
|
|
64
74
|
end
|
@@ -35,6 +35,7 @@ module IOBlockReader
|
|
35
35
|
# * _String_: The resulting data
|
36
36
|
def [](range)
|
37
37
|
#puts "[IOBlockReader] - [](#{range.inspect})"
|
38
|
+
#display_current_blocks
|
38
39
|
if (range.is_a?(Fixnum))
|
39
40
|
# Use the cache if possible
|
40
41
|
return @cached_block.data[range - @cached_block.offset] if ((@cached_block != nil) and (range >= @cached_block.offset) and (range < @cached_block_end_offset))
|
@@ -94,7 +95,7 @@ module IOBlockReader
|
|
94
95
|
# Warning: The token(s) to be found have to be smaller than the block size given to the constructor, otherwise they won't be found (you've been warned!). If you really need to search for tokens bigger than block size, extract the data using [] operator first, and then use index on it ; it will however make a complete copy of the data in memory prior to searching tokens.
|
95
96
|
#
|
96
97
|
# Parameters::
|
97
|
-
# * *token* (_String_, _Regexp_ or <em>list<Object></em>): Token to be found. Can be a list of tokens.
|
98
|
+
# * *token* (_String_, _Regexp_ or <em>list<Object></em>): Token to be found. Can be a list of tokens. Please note that using a list of tokens is slower than using a single Regexp.
|
98
99
|
# * *offset* (_Fixnum_): Offset starting the search [optional = 0]
|
99
100
|
# * *max_size_regexp* (_Fixnum_): Maximal number of characters the match should take in case of a Regexp token. Ignored if token is a String. [optional = 32]
|
100
101
|
# Result::
|
@@ -143,6 +144,7 @@ module IOBlockReader
|
|
143
144
|
# Loop on subsequent blocks to search for token
|
144
145
|
result = nil
|
145
146
|
while ((result == nil) and (!current_block.last_block?))
|
147
|
+
#puts "[IOBlockReader] - index(#{token.inspect}, #{offset}, #{max_size_regexp}) - No find in last block #{current_block}. Continuing..."
|
146
148
|
# Check that next block is loaded
|
147
149
|
if ((next_block = @blocks[current_block_index+1]) == nil)
|
148
150
|
read_needed_blocks([current_block_index+1], current_block_index+1, current_block_index+1)
|
@@ -150,20 +152,24 @@ module IOBlockReader
|
|
150
152
|
else
|
151
153
|
next_block.touch
|
152
154
|
end
|
153
|
-
# Get data across the 2 blocks: enough to search for token_size data only
|
154
|
-
|
155
|
-
|
156
|
-
token.
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
155
|
+
# Get data across the 2 blocks if needed: enough to search for token_size data only
|
156
|
+
if (token_size > 1)
|
157
|
+
cross_data = current_block.data[1-token_size..-1] + next_block.data[0..token_size-2]
|
158
|
+
#puts "[IOBlockReader] - index(#{token.inspect}, #{offset}, #{max_size_regexp}) - Find token in cross data: #{cross_data.inspect}..."
|
159
|
+
if token_is_array
|
160
|
+
token.each_with_index do |token2, idx|
|
161
|
+
index_token2_in_block = cross_data.index(token2)
|
162
|
+
if (index_token2_in_block != nil) and ((index_in_block == nil) or (index_token2_in_block < index_in_block))
|
163
|
+
index_in_block = index_token2_in_block
|
164
|
+
index_matching_token = idx
|
165
|
+
end
|
161
166
|
end
|
167
|
+
else
|
168
|
+
index_in_block = cross_data.index(token)
|
162
169
|
end
|
163
|
-
else
|
164
|
-
index_in_block = cross_data.index(token)
|
165
170
|
end
|
166
171
|
if (index_in_block == nil)
|
172
|
+
#puts "[IOBlockReader] - index(#{token.inspect}, #{offset}, #{max_size_regexp}) - No find in cross blocks #{current_block} / #{next_block}. Continuing..." if (token_size > 1)
|
167
173
|
# Search in the next block
|
168
174
|
if token_is_array
|
169
175
|
token.each_with_index do |token2, idx|
|
@@ -302,6 +308,7 @@ module IOBlockReader
|
|
302
308
|
# Parameters::
|
303
309
|
# * *block* (_DataBlock_): Block to be cached
|
304
310
|
def set_cache_block(block)
|
311
|
+
#puts "[IOBlockReader] - Set cached block to offset #{block.offset}"
|
305
312
|
@cached_block = block
|
306
313
|
@cached_block_end_offset = block.offset + @block_size
|
307
314
|
end
|
@@ -344,10 +351,18 @@ module IOBlockReader
|
|
344
351
|
block_to_fill = removed_blocks.pop
|
345
352
|
block_to_fill = DataBlock.new(@io) if (block_to_fill == nil)
|
346
353
|
block_to_fill.fill(block_index * @block_size, @block_size)
|
354
|
+
# Update the cached block end offset if it was modified
|
355
|
+
@cached_block_end_offset = block_to_fill.offset + @block_size if (block_to_fill == @cached_block)
|
347
356
|
@blocks[block_index] = block_to_fill
|
348
357
|
end
|
349
358
|
end
|
350
359
|
|
360
|
+
# Display current blocks
|
361
|
+
def display_current_blocks
|
362
|
+
puts "[IOBlockReader] - #{@blocks.size} blocks: #{@blocks.map { |block| (block == nil) ? '[nil]' : block }.join(' ')}"
|
363
|
+
puts "[IOBlockReader] - Cached block: #{(@cached_block == nil) ? '[nil]' : @cached_block } - End: #{@cached_block_end_offset}"
|
364
|
+
end
|
365
|
+
|
351
366
|
end
|
352
367
|
|
353
368
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ioblockreader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.3.20130618
|
4
|
+
version: 1.0.4.20130725
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-06-18 00:00:00.000000000 Z
|
12
|
+
date: 2013-07-25 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: Ruby library giving block-buffered and cached read over IO objects with
|
15
15
|
a String-like interface. Ideal to parse big files as Strings, limiting memory consumption.
|