buftok 0.1

Files changed (3)
  1. data/Rakefile +31 -0
  2. data/lib/buftok.rb +76 -0
  3. metadata +49 -0
data/Rakefile ADDED
@@ -0,0 +1,31 @@
+ require 'rake'
+ require 'rake/rdoctask'
+ require 'rake/gempackagetask'
+ require 'spec/rake/spectask'
+
+ Spec::Rake::SpecTask.new(:spec) do |task|
+   task.spec_files = FileList['**/*_spec.rb']
+ end
+
+ Rake::RDocTask.new(:rdoc) do |task|
+   task.rdoc_dir = 'doc'
+   task.title = 'BufferedTokenizer'
+   task.rdoc_files.include('lib/**/*.rb')
+ end
+
+ spec = Gem::Specification.new do |s|
+   s.name = %q{buftok}
+   s.version = "0.1"
+   s.date = %q{2006-12-18}
+   s.summary = %q{BufferedTokenizer extracts token delimited entities from a sequence of arbitrary inputs}
+   s.email = %q{tony@clickcaster.com}
+   s.homepage = %q{http://buftok.rubyforge.org}
+   s.rubyforge_project = %q{buftok}
+   s.has_rdoc = true
+   s.authors = ["Tony Arcieri", "Martin Emde"]
+   s.files = ["Rakefile", "lib", "lib/buftok.rb"]
+ end
+
+ Rake::GemPackageTask.new(spec) do |pkg|
+   pkg.need_tar = true
+ end
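With this Rakefile in place, the usual invocations are as follows (the task names come straight from the definitions above; rake, RSpec 1.x and the legacy rdoc/gempackage task libraries are assumed to be installed):

  rake spec      # run every **/*_spec.rb through RSpec
  rake rdoc      # generate RDoc for lib/**/*.rb into doc/
  rake package   # build pkg/buftok-0.1.gem (plus a tarball, since need_tar = true)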
data/lib/buftok.rb ADDED
@@ -0,0 +1,76 @@
+ # BufferedTokenizer - Statefully split input data by a specifiable token
+ # (C)2006 Tony Arcieri, Martin Emde
+ # Distributed under the Ruby license (http://www.ruby-lang.org/en/LICENSE.txt)
+
+ # BufferedTokenizer takes a delimiter upon instantiation, or acts line-based
+ # by default. It allows input to be spoon-fed from some outside source which
+ # receives arbitrary-length datagrams which may or may not contain the token
+ # by which entities are delimited. In this respect it's ideally paired with
+ # something like EventMachine (http://rubyforge.org/projects/eventmachine)
+ class BufferedTokenizer
+   # New BufferedTokenizers will operate on lines delimited by "\n" by default,
+   # or allow you to specify any delimiter token you so choose, which will then
+   # be used by String#split to tokenize the input data
+   def initialize(delimiter = "\n")
+     # Store the specified delimiter
+     @delimiter = delimiter
+
+     # The input buffer is stored as an array. This is by far the most efficient
+     # approach given language constraints (in C a linked list would be a more
+     # appropriate data structure). Segments of input data are stored in a list
+     # which is only joined when a token is reached, substantially reducing the
+     # number of objects required for the operation.
+     @input = []
+   end
+
+   # Extract takes an arbitrary string of input data and returns an array of
+   # tokenized entities, provided there were any available to extract. This
+   # makes for easy processing of datagrams using a pattern like:
+   #
+   #   tokenizer.extract(data).map { |entity| Decode(entity) }.each do ...
+   def extract(data)
+     # Extract token-delimited entities from the input string with the split command.
+     # There's a bit of craftiness here with the -1 parameter. Normally split would
+     # behave no differently regardless of whether the token lies at the very end of
+     # the input buffer or not (i.e. a literal edge case). Specifying -1 forces split
+     # to return "" in this case, meaning that the last entry in the list represents
+     # a new segment of data where the token has not been encountered.
+     entities = data.split @delimiter, -1
+
+     # Move the first entry in the resulting array into the input buffer. It represents
+     # the last segment of a token-delimited entity unless it's the only entry in the list.
+     @input << entities.shift
+
+     # If the resulting array from the split is empty, the token was not encountered
+     # (not even at the end of the buffer). Since we've encountered no token-delimited
+     # entities this go-around, return an empty array.
+     return [] if entities.empty?
+
+     # At this point, we've hit a token, or potentially multiple tokens. Now we can bring
+     # together all the data we've buffered from earlier calls without hitting a token,
+     # and add it to our list of discovered entities.
+     entities.unshift @input.join
+
+     # Now that we've hit a token, joined the input buffer and added it to the entities
+     # list, we can go ahead and clear the input buffer. All of the segments that were
+     # stored before the join can now be garbage collected.
+     @input.clear
+
+     # The last entity in the list is not token-delimited, however, thanks to the -1
+     # passed to split. It represents the beginning of a new list of as-yet-untokenized
+     # data, so we add it to the start of the (now empty) input buffer.
+     @input << entities.pop
+
+     # Now we're left with the list of extracted token-delimited entities we wanted
+     # in the first place. Hooray!
+     entities
+   end
+
+   # Flush the contents of the input buffer, i.e. return the input buffer even though
+   # a token has not yet been encountered
+   def flush
+     buffer = @input.join
+     @input.clear
+     buffer
+   end
+ end
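For reference, a minimal usage sketch of the class added above, assuming the gem is installed so that lib/buftok.rb is on the load path; the sample strings and the return values shown in the comments are illustrative only:

  require 'buftok'

  tokenizer = BufferedTokenizer.new   # line-oriented: delimiter defaults to "\n"

  # Entities may be split across datagrams; extract only returns completed ones.
  tokenizer.extract("foo\nbar")       # => ["foo"]            ("bar" stays buffered)
  tokenizer.extract("baz\nqux\n")     # => ["barbaz", "qux"]  (buffer joined with the new data)
  tokenizer.flush                     # => ""                 (nothing left after a trailing delimiter)

  tokenizer.extract("partial")        # => []                 (no delimiter seen yet)
  tokenizer.flush                     # => "partial"          (drain the buffer explicitly)

The empty trailing entry produced by split's -1 argument is what tells the second extract call that "qux\n" ended exactly on a delimiter, so nothing spills over into the following call.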
metadata ADDED
@@ -0,0 +1,49 @@
+ --- !ruby/object:Gem::Specification
+ rubygems_version: 0.9.0
+ specification_version: 1
+ name: buftok
+ version: !ruby/object:Gem::Version
+   version: "0.1"
+ date: 2006-12-18 00:00:00 -07:00
+ summary: BufferedTokenizer extracts token delimited entities from a sequence of arbitrary inputs
+ require_paths:
+ - lib
+ email: tony@clickcaster.com
+ homepage: http://buftok.rubyforge.org
+ rubyforge_project: buftok
+ description:
+ autorequire:
+ default_executable:
+ bindir: bin
+ has_rdoc: true
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
+   requirements:
+   - - ">"
+     - !ruby/object:Gem::Version
+       version: 0.0.0
+   version:
+ platform: ruby
+ signing_key:
+ cert_chain:
+ post_install_message:
+ authors:
+ - Tony Arcieri
+ - Martin Emde
+ files:
+ - Rakefile
+ - lib
+ - lib/buftok.rb
+ test_files: []
+
+ rdoc_options: []
+
+ extra_rdoc_files: []
+
+ executables: []
+
+ extensions: []
+
+ requirements: []
+
+ dependencies: []
+