detabulator 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,35 @@
1
+ Detabulator
2
+ ===========
3
+
4
+ Extract columnar data from tabulated fixed-width text.
5
+
6
+ Example
7
+ -------
8
+
9
+ Given some tabulated data:
10
+
11
+ sample = <<END
12
+             Column 1    Column 2
13
+ Drinks      Beer        Whiskey
14
+ Not drinks  Toothpaste  Mouthwash
15
+ END
16
+
17
+ This:
18
+
19
+ require "detabulator"
20
+ Detabulator.new.detabulate(sample)
21
+
22
+ Will produce this:
23
+
24
+ [["", "Column 1", "Column 2"],
25
+ ["Drinks", "Beer", "Whiskey"],
26
+ ["Not drinks", "Toothpaste", "Mouthwash"]]
27
+
28
+ This is just a first release that does something useful.
29
+
30
+ Limitations
31
+ -----------
32
+
33
+ * Double-width characters (e.g. Japanese) are not handled correctly.
34
+ * Combining diacritics are not handled correctly.
35
+ * When one cell is much longer than the others in its column and contains a space that lines up with blank space in every other row, an extra column is generated.
@@ -0,0 +1,50 @@
1
# Splits tabulated fixed-width text into rows of cells.
#
# A column boundary is inferred wherever a character position is blank (a
# space, or past the end of the line) in every line of the input.
#
#   Detabulator.new.detabulate("a  b\nc  d") # => [["a", "b"], ["c", "d"]]
#
class Detabulator
  # Codepoint of the space character, as produced by unpack("U*").
  SPACE = 32

  # Extract the cells of a fixed-width table.
  #
  # s - String of tabulated text. Lines may end in "\n" or "\r\n".
  #
  # Returns an Array holding one Array of whitespace-stripped cell Strings per
  # line; every row has the same number of cells. Returns [] when s contains
  # no lines (empty string, or nothing but line breaks).
  def detabulate(s)
    # Work on codepoint arrays so that each character occupies one position.
    # The optional \r keeps a CRLF line ending from being counted as a
    # non-blank character at the end of each line.
    lines = s.split(/\r?\n/).map{ |line| line.unpack("U*") }

    # Without lines there is no maximum line length to build a mask from.
    return [] if lines.empty?

    lengths = extract_segment_lengths(
      collapse_space_clusters(
        space_mask(lines)))

    lines.map do |line|
      offset = 0
      lengths.map do |length|
        # Slicing beyond the end of a short line yields nil: an empty cell.
        cell = (line[offset, length] || []).pack("U*").strip
        offset += length
        cell
      end
    end
  end

  private

  # Mark the positions that are blank in every line. Lines shorter than the
  # longest one count as blank beyond their end.
  #
  #   aa   bb
  #   c    dddd ee
  #   => ..TTT....T..
  #
  def space_mask(lines)
    max_line_length = lines.map{ |line| line.length }.max
    lines.inject([true] * max_line_length) do |mask, line|
      # zip pads a short line with nil, which is treated like a space.
      mask.zip(line).map{ |blank, char| blank && (!char || char == SPACE) }
    end
  end

  # Keep only the first position of each run of blank positions.
  #
  #   ..TTT....T..
  #   => ..T......T..
  #
  def collapse_space_clusters(mask)
    previous = false
    mask.map do |blank|
      starts_cluster = blank && !previous
      previous = blank
      starts_cluster
    end
  end

  # Turn the collapsed mask into consecutive slice lengths.
  #
  #   ..T......T.. => [3, 7, 3]
  #
  # Each segment ends on its marked blank position, so the separator belongs
  # to the cell on its left. Because the count starts at 1, the lengths add up
  # to one more than the mask length; the surplus falls on the last segment
  # and is harmless, since slicing a line stops at its end.
  #
  def extract_segment_lengths(mask)
    mask.inject([1]) do |lengths, boundary|
      if boundary
        lengths << 1
      else
        lengths[-1] += 1
        lengths
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
class Detabulator
  # Version of the detabulator gem. Frozen so that code reading the constant
  # cannot modify the string in place.
  VERSION = "0.1.0".freeze
end
metadata ADDED
@@ -0,0 +1,58 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: detabulator
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Paul Battley
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-07-18 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Extract columnar data from tabulated fixed-width text
17
+ email:
18
+ - pbattley@gmail.com
19
+ executables: []
20
+
21
+ extensions: []
22
+
23
+ extra_rdoc_files: []
24
+
25
+ files:
26
+ - lib/detabulator.rb
27
+ - lib/detabulator/version.rb
28
+ - README.md
29
+ has_rdoc: true
30
+ homepage: http://github.com/threedaymonk/detabulator
31
+ licenses: []
32
+
33
+ post_install_message:
34
+ rdoc_options: []
35
+
36
+ require_paths:
37
+ - lib
38
+ required_ruby_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: "0"
43
+ version:
44
+ required_rubygems_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: "0"
49
+ version:
50
+ requirements: []
51
+
52
+ rubyforge_project:
53
+ rubygems_version: 1.3.5
54
+ signing_key:
55
+ specification_version: 3
56
+ summary: Extract columnar data from tabulated fixed-width text
57
+ test_files: []
58
+