buftok 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. data/Rakefile +31 -0
  2. data/lib/buftok.rb +76 -0
  3. metadata +49 -0
@@ -0,0 +1,31 @@
1
+ require 'rake'
2
+ require 'rake/rdoctask'
3
+ require 'rake/gempackagetask'
4
+ require 'spec/rake/spectask'
5
+
6
# `rake spec`: run every *_spec.rb file found anywhere below the project root.
Spec::Rake::SpecTask.new(:spec) { |t| t.spec_files = FileList['**/*_spec.rb'] }

# `rake rdoc`: build API documentation for the library sources into ./doc.
Rake::RDocTask.new(:rdoc) do |rd|
  rd.rdoc_dir = 'doc'
  rd.title = 'BufferedTokenizer'
  rd.rdoc_files.include('lib/**/*.rb')
end

# Gem metadata; this is the specification handed to the packaging task below.
spec = Gem::Specification.new do |gem|
  gem.name = 'buftok'
  gem.version = '0.1'
  gem.date = '2006-12-18'
  gem.summary = 'BufferedTokenizer extracts token delimited entities from a sequence of arbitrary inputs'
  gem.email = 'tony@clickcaster.com'
  gem.homepage = 'http://buftok.rubyforge.org'
  gem.rubyforge_project = 'buftok'
  gem.has_rdoc = true
  gem.authors = ['Tony Arcieri', 'Martin Emde']
  gem.files = %w[Rakefile lib lib/buftok.rb]
end

# Packaging tasks for the gem, additionally producing a tar archive.
Rake::GemPackageTask.new(spec) { |package| package.need_tar = true }
@@ -0,0 +1,76 @@
1
+ # BufferedTokenizer - Statefully split input data by a specifiable token
2
+ # (C)2006 Tony Arcieri, Martin Emde
3
+ # Distributed under the Ruby license (http://www.ruby-lang.org/en/LICENSE.txt)
4
+
5
+ # BufferedTokenizer takes a delimiter upon instantiation, or acts line-based
6
+ # by default. It allows input to be spoon-fed from some outside source which
7
+ # receives arbitrary length datagrams which may-or-may-not contain the token
8
+ # by which entities are delimited. In this respect it's ideally paired with
9
+ # something like EventMachine (http://rubyforge.org/projects/eventmachine)
10
# Statefully splits a stream of input chunks into entities separated by a
# delimiter token. Chunks may end anywhere, including in the middle of an
# entity or in the middle of a multi-character delimiter; incomplete data is
# buffered until a later chunk completes it.
class BufferedTokenizer
  # Creates a tokenizer.
  #
  # delimiter - the token passed to String#split to separate entities.
  #             Defaults to "\n", i.e. line-based operation.
  def initialize(delimiter = "\n")
    @delimiter = delimiter

    # Number of trailing buffered characters that could still turn out to be
    # the beginning of a delimiter completed by the next chunk. This is only
    # known for string delimiters longer than one character; for anything
    # else (for example a Regexp) no look-back is attempted.
    @overlap = 0
    if delimiter.respond_to?(:to_str)
      @overlap = [delimiter.to_str.length - 1, 0].max
    end

    # Pending input is kept as a list of segments and only joined once a
    # delimiter is seen, which avoids re-copying the buffer on every chunk.
    @input = []
  end

  # Feeds one chunk of input into the tokenizer.
  #
  # data - a String of arbitrary length.
  #
  # Returns an Array of the entities completed by this chunk (possibly
  # empty). Data following the last delimiter stays buffered. Typical use:
  #
  #   tokenizer.extract(data).map { |entity| Decode(entity) }.each do ...
  def extract(data)
    # Re-examine the end of the buffered input together with the new chunk so
    # that a delimiter split across two chunks ("foo\r" + "\nbar") is found.
    carry = reclaim_tail
    data = carry + data unless carry.empty?

    # The -1 limit makes split keep a trailing "" when the chunk ends exactly
    # on a delimiter, so the last element is always the not-yet-terminated
    # remainder.
    entities = data.split(@delimiter, -1)

    # The first element continues whatever was buffered before. An empty
    # chunk splits to [], in which case there is nothing to buffer.
    head = entities.shift
    @input << head if head

    # Nothing left after the shift means no delimiter was seen in this chunk.
    return [] if entities.empty?

    # A delimiter was hit: everything buffered so far forms the first entity.
    entities.unshift(@input.join)
    @input.clear

    # The final element is unterminated data; it seeds the emptied buffer.
    @input << entities.pop

    entities
  end

  # Returns everything currently buffered as a single String, even though no
  # delimiter has been encountered for it, and empties the buffer.
  def flush
    buffer = @input.join
    @input.clear
    buffer
  end

  private

  # Removes up to @overlap trailing characters from the buffered segments and
  # returns them as a String; anything in front of them is put back on the
  # buffer. Returns "" when no look-back is needed or the buffer is empty.
  def reclaim_tail
    tail = ''
    while tail.length < @overlap && !@input.empty?
      tail = @input.pop + tail
    end

    # Give back the part that is too far from the end to belong to a
    # delimiter straddling the chunk boundary.
    if tail.length > @overlap
      @input << tail.slice!(0, tail.length - @overlap)
    end

    tail
  end
end
metadata ADDED
@@ -0,0 +1,49 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: buftok
5
+ version: !ruby/object:Gem::Version
6
+ version: "0.1"
7
+ date: 2006-12-18 00:00:00 -07:00
8
+ summary: BufferedTokenizer extracts token delimited entities from a sequence of arbitrary inputs
9
+ require_paths:
10
+ - lib
11
+ email: tony@clickcaster.com
12
+ homepage: http://buftok.rubyforge.org
13
+ rubyforge_project: buftok
14
+ description:
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Tony Arcieri
31
+ - Martin Emde
32
+ files:
33
+ - Rakefile
34
+ - lib
35
+ - lib/buftok.rb
36
+ test_files: []
37
+
38
+ rdoc_options: []
39
+
40
+ extra_rdoc_files: []
41
+
42
+ executables: []
43
+
44
+ extensions: []
45
+
46
+ requirements: []
47
+
48
+ dependencies: []
49
+