detabulator 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +35 -0
- data/lib/detabulator.rb +50 -0
- data/lib/detabulator/version.rb +3 -0
- metadata +58 -0
data/README.md
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
Detabulator
|
2
|
+
===========
|
3
|
+
|
4
|
+
Extract columnar data from tabulated fixed-width text.
|
5
|
+
|
6
|
+
Example
|
7
|
+
-------
|
8
|
+
|
9
|
+
Given some tabulated data:
|
10
|
+
|
11
|
+
sample = <<END
|
12
|
+
            Column 1    Column 2
|
13
|
+
Drinks      Beer        Whiskey
|
14
|
+
Not drinks  Toothpaste  Mouthwash
|
15
|
+
END
|
16
|
+
|
17
|
+
This:
|
18
|
+
|
19
|
+
require "detabulator"
|
20
|
+
Detabulator.new.detabulate(sample)
|
21
|
+
|
22
|
+
Will produce this:
|
23
|
+
|
24
|
+
[["", "Column 1", "Column 2"],
|
25
|
+
["Drinks", "Beer", "Whiskey"],
|
26
|
+
["Not drinks", "Toothpaste", "Mouthwash"]]
|
27
|
+
|
28
|
+
This is just a first release that does something useful.
|
29
|
+
|
30
|
+
Limitations
|
31
|
+
-----------
|
32
|
+
|
33
|
+
* Double-width characters (e.g. Japanese) are not handled correctly.
|
34
|
+
* Combining diacritics are not handled correctly.
|
35
|
+
* When one cell contains much longer text and a space, an extra column is generated.
|
data/lib/detabulator.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Extracts columnar data from tabulated fixed-width text.
#
# Column boundaries are inferred from the text itself: a character column
# that is blank (a space, or past the end of the line) on *every* line is
# treated as column padding, and each run of such columns separates two
# cells.
#
#   Detabulator.new.detabulate(text)  # => [["cell", "cell", ...], ...]
#
class Detabulator
  # Unicode codepoint of the space character; lines are processed as arrays
  # of codepoints so that multi-byte characters occupy a single position.
  SPACE = 32

  # Splits +s+ into rows of stripped cell strings.
  #
  # s - String of newline-separated, space-aligned tabular text. Both LF and
  #     CRLF line endings are accepted.
  #
  # Returns an Array with one Array of Strings per input line. Every row has
  # the same number of cells; cells missing from a short line are "".
  # Returns [] when +s+ contains no lines (e.g. "" or "\n").
  def detabulate(s)
    # Strip the "\r" of CRLF endings up front: left in place it counts as a
    # non-space character and can break up an otherwise blank column.
    lines = s.split(/\r?\n/).map { |line| line.unpack("U*") }

    # With no lines there is no width to build a mask from.
    return [] if lines.empty?

    lengths = extract_segment_lengths(
                collapse_space_clusters(
                  space_mask(lines)))

    lines.map do |line|
      offset = 0
      lengths.map do |length|
        # Array#[] returns nil once offset is past the end of a short line.
        cell = (line[offset, length] || []).pack("U*").strip
        offset += length
        cell
      end
    end
  end

private
  # Builds a mask that is true at every position where all lines are blank
  # (a space, or beyond the end of the line).
  #
  #    aa     bb
  #    c   dddd ee
  # => ..TTT....T..
  #
  # lines - non-empty Array of codepoint Arrays.
  def space_mask(lines)
    max_line_length = lines.map { |codepoints| codepoints.length }.max
    lines.inject([true] * max_line_length) do |mask, line|
      # zip pads short lines with nil, which counts as blank.
      mask.zip(line).map { |blank, codepoint| blank && (!codepoint || codepoint == SPACE) }
    end
  end

  # Reduces each run of true values to its first element, so that every
  # cluster of blank columns is represented by a single boundary marker.
  #
  #    ..TTT....T..
  # => ..T......T..
  #
  def collapse_space_clusters(mask)
    mask.each_with_index.map do |blank, index|
      blank && (index == 0 || !mask[index - 1])
    end
  end

  # Converts a boundary mask into the widths of consecutive segments.
  #
  #    ..T......T.. => [3, 7, 3]
  #
  # Each segment includes the boundary column that terminates it, which is
  # why every counter (including the first) starts at 1. The final segment
  # has no terminating boundary and is therefore one position longer than
  # the remaining text; this is harmless because slicing past the end of a
  # line simply truncates.
  def extract_segment_lengths(mask)
    mask.inject([1]) do |lengths, boundary|
      if boundary
        lengths << 1
      else
        lengths[-1] += 1
        lengths
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: detabulator
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Paul Battley
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-07-18 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Extract columnar data from tabulated fixed-width text
|
17
|
+
email:
|
18
|
+
- pbattley@gmail.com
|
19
|
+
executables: []
|
20
|
+
|
21
|
+
extensions: []
|
22
|
+
|
23
|
+
extra_rdoc_files: []
|
24
|
+
|
25
|
+
files:
|
26
|
+
- lib/detabulator.rb
|
27
|
+
- lib/detabulator/version.rb
|
28
|
+
- README.md
|
29
|
+
has_rdoc: true
|
30
|
+
homepage: http://github.com/threedaymonk/detabulator
|
31
|
+
licenses: []
|
32
|
+
|
33
|
+
post_install_message:
|
34
|
+
rdoc_options: []
|
35
|
+
|
36
|
+
require_paths:
|
37
|
+
- lib
|
38
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: "0"
|
43
|
+
version:
|
44
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: "0"
|
49
|
+
version:
|
50
|
+
requirements: []
|
51
|
+
|
52
|
+
rubyforge_project:
|
53
|
+
rubygems_version: 1.3.5
|
54
|
+
signing_key:
|
55
|
+
specification_version: 3
|
56
|
+
summary: Extract columnar data from tabulated fixed-width text
|
57
|
+
test_files: []
|
58
|
+
|