buftok 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +31 -0
- data/lib/buftok.rb +76 -0
- metadata +49 -0
data/Rakefile
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'rake'
|
2
|
+
require 'rake/rdoctask'
|
3
|
+
require 'rake/gempackagetask'
|
4
|
+
require 'spec/rake/spectask'
|
5
|
+
|
6
|
+
# Defines the `spec` rake task. Its file list is every path matching the
# glob '**/*_spec.rb'.
Spec::Rake::SpecTask.new(:spec) { |runner| runner.spec_files = FileList['**/*_spec.rb'] }
|
9
|
+
|
10
|
+
# Defines the `rdoc` rake task: documentation titled 'BufferedTokenizer' is
# generated into the 'doc' directory from every Ruby file under lib/.
Rake::RDocTask.new(:rdoc) do |rdoc|
  rdoc.title    = 'BufferedTokenizer'
  rdoc.rdoc_dir = 'doc'
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
15
|
+
|
16
|
+
# Gem metadata for buftok. The resulting specification object is handed to
# the packaging task defined right below it.
spec = Gem::Specification.new do |gem|
  # Identity
  gem.name    = 'buftok'
  gem.version = '0.1'
  gem.date    = '2006-12-18'
  gem.summary = 'BufferedTokenizer extracts token delimited entities from a sequence of arbitrary inputs'

  # Contact and project information
  gem.authors           = ['Tony Arcieri', 'Martin Emde']
  gem.email             = 'tony@clickcaster.com'
  gem.homepage          = 'http://buftok.rubyforge.org'
  gem.rubyforge_project = 'buftok'

  # Packaged contents
  gem.has_rdoc = true
  gem.files    = ['Rakefile', 'lib', 'lib/buftok.rb']
end

# Defines the gem packaging tasks for the specification above; need_tar
# additionally requests a tar archive of the package.
Rake::GemPackageTask.new(spec) { |pkg| pkg.need_tar = true }
|
data/lib/buftok.rb
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
# BufferedTokenizer - Statefully split input data by a specifiable token
|
2
|
+
# (C)2006 Tony Arcieri, Martin Emde
|
3
|
+
# Distributed under the Ruby license (http://www.ruby-lang.org/en/LICENSE.txt)
|
4
|
+
|
5
|
+
# BufferedTokenizer takes a delimiter upon instantiation, or acts line-based
|
6
|
+
# by default. It allows input to be spoon-fed from some outside source which
|
7
|
+
# receives arbitrary length datagrams which may-or-may-not contain the token
|
8
|
+
# by which entities are delimited. In this respect it's ideally paired with
|
9
|
+
# something like EventMachine (http://rubyforge.org/projects/eventmachine)
|
10
|
+
class BufferedTokenizer
  # Creates a tokenizer that splits its input on +delimiter+.
  #
  # delimiter:: the pattern handed to String#split; defaults to "\n".
  #
  # When the delimiter responds to +length+ (e.g. a String) and is longer
  # than one character, #extract also recognises a delimiter whose characters
  # arrive split across consecutive calls. Any other delimiter (e.g. a Regexp)
  # is only matched within the text supplied to a single #extract call.
  def initialize(delimiter = "\n")
    @delimiter = delimiter

    # Number of trailing buffered characters that could still turn out to be
    # the beginning of a delimiter once more data arrives. Zero when the
    # delimiter has no known length or is at most one character long.
    @trim = delimiter.respond_to?(:length) ? [delimiter.length - 1, 0].max : 0

    # Segments received since the last delimiter. They are kept in an array
    # and joined only once a delimiter shows up, so no intermediate strings
    # are built while data trickles in.
    @input = []

    # The most recently received undelimited segment. It is kept apart from
    # @input because its last @trim characters must be re-examined together
    # with the next chunk of data.
    @tail = ''
  end

  # Feeds +data+ (a String) into the tokenizer and returns an Array of the
  # complete, delimiter-terminated entities now available. The Array is empty
  # when no delimiter has been seen yet. Text following the last delimiter
  # stays buffered for later #extract calls or #flush.
  #
  #   tokenizer.extract(data).map { |entity| Decode(entity) }.each do ...
  def extract(data)
    if @trim > 0
      # Put the end of the buffered tail in front of the new data so that a
      # delimiter straddling the two chunks is seen by split. A tail shorter
      # than @trim is carried over entirely.
      keep = [@tail.length - @trim, 0].max
      data = @tail[keep..-1] + data
      @tail = @tail[0, keep]
    end

    # Whatever is left of the old tail can no longer be part of a delimiter.
    @input << @tail unless @tail.empty?

    # The -1 limit makes split keep a trailing "" when data ends with the
    # delimiter, so the last element is always the not-yet-delimited rest.
    entities = data.split(@delimiter, -1)

    # Splitting "" yields [], hence the fallback to an empty segment.
    @tail = entities.shift || ''

    # Only one segment means no delimiter was found: keep buffering.
    return [] if entities.empty?

    # A delimiter was hit. The first segment completes the entity that was
    # being buffered; join it with everything stored so far.
    @input << @tail
    entities.unshift(@input.join)
    @input.clear

    # The final segment starts the next, still undelimited, entity.
    @tail = entities.pop

    entities
  end

  # Returns everything currently buffered as a single String, even though no
  # delimiter has been encountered for it, and empties the buffer.
  def flush
    @input << @tail
    buffer = @input.join
    @input.clear
    @tail = ''
    buffer
  end
end
|
metadata
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0
|
3
|
+
specification_version: 1
|
4
|
+
name: buftok
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: "0.1"
|
7
|
+
date: 2006-12-18 00:00:00 -07:00
|
8
|
+
summary: BufferedTokenizer extracts token delimited entities from a sequence of arbitrary inputs
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: tony@clickcaster.com
|
12
|
+
homepage: http://buftok.rubyforge.org
|
13
|
+
rubyforge_project: buftok
|
14
|
+
description:
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Tony Arcieri
|
31
|
+
- Martin Emde
|
32
|
+
files:
|
33
|
+
- Rakefile
|
34
|
+
- lib
|
35
|
+
- lib/buftok.rb
|
36
|
+
test_files: []
|
37
|
+
|
38
|
+
rdoc_options: []
|
39
|
+
|
40
|
+
extra_rdoc_files: []
|
41
|
+
|
42
|
+
executables: []
|
43
|
+
|
44
|
+
extensions: []
|
45
|
+
|
46
|
+
requirements: []
|
47
|
+
|
48
|
+
dependencies: []
|
49
|
+
|