detabulator 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +35 -0
- data/lib/detabulator.rb +50 -0
- data/lib/detabulator/version.rb +3 -0
- metadata +58 -0
data/README.md
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
Detabulator
|
2
|
+
===========
|
3
|
+
|
4
|
+
Extract columnar data from tabulated fixed-width text.
|
5
|
+
|
6
|
+
Example
|
7
|
+
-------
|
8
|
+
|
9
|
+
Given some tabulated data:
|
10
|
+
|
11
|
+
sample = <<END
|
12
|
+
            Column 1    Column 2
|
13
|
+
Drinks      Beer        Whiskey
|
14
|
+
Not drinks  Toothpaste  Mouthwash
|
15
|
+
END
|
16
|
+
|
17
|
+
This:
|
18
|
+
|
19
|
+
require "detabulator"
|
20
|
+
Detabulator.new.detabulate(sample)
|
21
|
+
|
22
|
+
Will produce this:
|
23
|
+
|
24
|
+
[["", "Column 1", "Column 2"],
|
25
|
+
["Drinks", "Beer", "Whiskey"],
|
26
|
+
["Not drinks", "Toothpaste", "Mouthwash"]]
|
27
|
+
|
28
|
+
This is just a first release that does something useful.
|
29
|
+
|
30
|
+
Limitations
|
31
|
+
-----------
|
32
|
+
|
33
|
+
* Double-width characters (e.g. Japanese) are not handled correctly.
|
34
|
+
* Combining diacritics are not handled correctly.
|
35
|
+
* When one cell contains much longer text and a space, an extra column is generated.
|
data/lib/detabulator.rb
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Extracts columnar data from tabulated, fixed-width text.
#
# Column boundaries are the character positions that are blank (a space, or
# past the end of the line) in *every* line of the input. Lines are handled
# as arrays of Unicode code points, so one code point counts as one column.
class Detabulator
  # Code point of the ASCII space character.
  SPACE = 32

  # Splits every line of +s+ into its cells.
  #
  # @param s [String] newline-separated, fixed-width tabulated text
  # @return [Array<Array<String>>] one array of stripped cell strings per
  #   line; every row has the same number of cells. Returns an empty array
  #   when +s+ contains no lines (e.g. "" or only newlines).
  def detabulate(s)
    lines = s.split(/\n/).map { |line| line.unpack('U*') }

    # With no lines there is no maximum width to build a mask from; the
    # helpers below would otherwise fail on nil.
    return [] if lines.empty?

    lengths = extract_segment_lengths(
      collapse_space_clusters(
        space_mask(lines)
      )
    )

    lines.map do |line|
      offset = 0
      lengths.map do |length|
        # Slicing from beyond the end of a short line yields nil, which is
        # treated as an empty cell.
        cell = (line[offset, length] || []).pack('U*').strip
        offset += length
        cell
      end
    end
  end

  private

  # Marks the columns that are blank in every line.
  #
  #    aa   bb
  #    c    dddd ee
  # => ..TTT....T..
  #
  # A position past the end of a line counts as blank. +lines+ must not be
  # empty.
  def space_mask(lines)
    width = lines.map(&:length).max
    (0...width).map do |column|
      lines.all? { |line| line[column].nil? || line[column] == SPACE }
    end
  end

  # Keeps only the first column of each run of blank columns.
  #
  #    ..TTT....T..
  # => ..T......T..
  #
  def collapse_space_clusters(mask)
    mask.each_with_index.map do |blank, index|
      # Index 0 has no predecessor, so a leading blank column is kept.
      blank && (index.zero? || !mask[index - 1])
    end
  end

  # Converts a collapsed mask into consecutive segment lengths.
  #
  # ..T......T.. => [3, 7, 3]
  #
  # The accumulator is seeded with 1, so each marked column becomes the last
  # column of the segment before it, and the lengths add up to one more than
  # the mask length. The overshoot in the final segment is harmless because
  # slicing stops at the end of the line.
  def extract_segment_lengths(mask)
    mask.each_with_object([1]) do |boundary, lengths|
      if boundary
        lengths << 1
      else
        lengths[-1] += 1
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: detabulator
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Paul Battley
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-07-18 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Extract columnar data from tabulated fixed-width text
|
17
|
+
email:
|
18
|
+
- pbattley@gmail.com
|
19
|
+
executables: []
|
20
|
+
|
21
|
+
extensions: []
|
22
|
+
|
23
|
+
extra_rdoc_files: []
|
24
|
+
|
25
|
+
files:
|
26
|
+
- lib/detabulator.rb
|
27
|
+
- lib/detabulator/version.rb
|
28
|
+
- README.md
|
29
|
+
has_rdoc: true
|
30
|
+
homepage: http://github.com/threedaymonk/detabulator
|
31
|
+
licenses: []
|
32
|
+
|
33
|
+
post_install_message:
|
34
|
+
rdoc_options: []
|
35
|
+
|
36
|
+
require_paths:
|
37
|
+
- lib
|
38
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: "0"
|
43
|
+
version:
|
44
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: "0"
|
49
|
+
version:
|
50
|
+
requirements: []
|
51
|
+
|
52
|
+
rubyforge_project:
|
53
|
+
rubygems_version: 1.3.5
|
54
|
+
signing_key:
|
55
|
+
specification_version: 3
|
56
|
+
summary: Extract columnar data from tabulated fixed-width text
|
57
|
+
test_files: []
|
58
|
+
|