detabulator 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
1
+ Detabulator
2
+ ===========
3
+
4
+ Extract columnar data from tabulated fixed-width text.
5
+
6
+ Example
7
+ -------
8
+
9
+ Given some tabulated data:
10
+
11
+ sample = <<END
12
+ Column 1 Column 2
13
+ Drinks Beer Whiskey
14
+ Not drinks Toothpaste Mouthwash
15
+ END
16
+
17
+ This:
18
+
19
+ require "detabulator"
20
+ Detabulator.new.detabulate(sample)
21
+
22
+ Will produce this:
23
+
24
+ [["", "Column 1", "Column 2"],
25
+ ["Drinks", "Beer", "Whiskey"],
26
+ ["Not drinks", "Toothpaste", "Mouthwash"]]
27
+
28
+ This is just a first release that does something useful.
29
+
30
+ Limitations
31
+ -----------
32
+
33
+ * Double-width characters (e.g. Japanese) are not handled correctly.
34
+ * Combining diacritics are not handled correctly.
35
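+ * When one cell contains much longer text than the other cells in its column, and that text includes a space at a position where every other row is blank, an extra column is generated.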
@@ -0,0 +1,50 @@
1
# Splits tabulated fixed-width text into rows of cells.
#
# A column boundary is any character position at which every line has
# either a space or no character at all. Text is processed as arrays of
# Unicode code points, one array element per code point.
class Detabulator
  # Code point of the ASCII space character.
  SPACE = 32

  # Extracts the cells of a fixed-width table.
  #
  # @param s [String] the tabulated text, one table row per line; both
  #   "\n" and "\r\n" line endings are accepted
  # @return [Array<Array<String>>] one array per input line, each holding
  #   the whitespace-stripped cell text for every detected column; an
  #   empty array when the input contains no lines
  def detabulate(s)
    # Strip an optional "\r" along with the newline: a stray carriage
    # return is not a space and would otherwise hide a column boundary.
    lines = s.split(/\r?\n/).map { |line| line.unpack("U*") }
    # Nothing to tabulate; also avoids computing the max of an empty list.
    return [] if lines.empty?

    lengths = extract_segment_lengths(
      collapse_space_clusters(
        space_mask(lines)
      )
    )

    lines.map do |line|
      offset = 0
      lengths.map do |length|
        # Slicing past the end of a short line yields nil; treat that as
        # an empty cell.
        cell = (line[offset, length] || []).pack("U*").strip
        offset += length
        cell
      end
    end
  end

  private

  # Marks the positions that are blank (space or past end of line) in
  # every line.
  #
  #   aa   bb
  #   c    dddd ee
  #   => ..TTT....T..
  #
  def space_mask(lines)
    width = lines.map(&:length).max
    lines.inject([true] * width) do |mask, line|
      # zip pads short lines with nil, which counts as blank.
      mask.zip(line).map do |blank_so_far, char|
        blank_so_far && (char.nil? || char == SPACE)
      end
    end
  end

  # Keeps only the first position of each run of blank positions.
  #
  #   ..TTT....T..
  #   => ..T......T..
  #
  def collapse_space_clusters(mask)
    mask.each_with_index.map do |blank, index|
      blank && (index.zero? || !mask[index - 1])
    end
  end

  # Converts a collapsed mask into the widths of the segments to slice.
  #
  #   ..T......T.. => [3, 7, 3]
  #
  # Each segment ends on a marked position, so the separator belongs to
  # the segment on its left. The accumulator is seeded with 1, which makes
  # the final width overshoot the mask by one; that is harmless because
  # slicing stops at the end of the line.
  def extract_segment_lengths(mask)
    mask.each_with_object([1]) do |boundary, lengths|
      if boundary
        lengths << 1
      else
        lengths[-1] += 1
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
class Detabulator
  # Version of the detabulator gem. Frozen so the shared constant cannot
  # be mutated in place by callers.
  VERSION = "0.1.0".freeze
end
metadata ADDED
@@ -0,0 +1,58 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: detabulator
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Paul Battley
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-07-18 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Extract columnar data from tabulated fixed-width text
17
+ email:
18
+ - pbattley@gmail.com
19
+ executables: []
20
+
21
+ extensions: []
22
+
23
+ extra_rdoc_files: []
24
+
25
+ files:
26
+ - lib/detabulator.rb
27
+ - lib/detabulator/version.rb
28
+ - README.md
29
+ has_rdoc: true
30
+ homepage: http://github.com/threedaymonk/detabulator
31
+ licenses: []
32
+
33
+ post_install_message:
34
+ rdoc_options: []
35
+
36
+ require_paths:
37
+ - lib
38
+ required_ruby_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: "0"
43
+ version:
44
+ required_rubygems_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: "0"
49
+ version:
50
+ requirements: []
51
+
52
+ rubyforge_project:
53
+ rubygems_version: 1.3.5
54
+ signing_key:
55
+ specification_version: 3
56
+ summary: Extract columnar data from tabulated fixed-width text
57
+ test_files: []
58
+