tag_along 0.7.3 → 0.8.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG +3 -0
- data/Gemfile +1 -0
- data/README.md +28 -0
- data/lib/tag_along/offsets.rb +27 -8
- data/lib/tag_along/tagged_text.rb +132 -0
- data/lib/tag_along/version.rb +1 -1
- data/lib/tag_along.rb +2 -2
- data/spec/spec_helper.rb +13 -0
- data/spec/tag_along/tagged_text_spec.rb +46 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 655afbd01a5e5232292d7795d5623182ac903813
|
4
|
+
data.tar.gz: 85eda2423fe4919d01e09e168cdccb7f0dc4ea8c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d0c4911a2bbc2d4d515d77bdede1064bb0a1e717c519307b22e247799f742fdff92577bfa9d4eb988f7e8bf19ec7eb107d025c613c5229ec958f4de4fe7fd8c0
|
7
|
+
data.tar.gz: 32cc83d07e519ebdf9f7b77820cf1da8506feb0c7f5c6e25434b252b4291fa82b70440ddeb1470f8270e16b657d7fcf6f313f49173dc1367afde9575aa8b8330
|
data/.gitignore
CHANGED
data/CHANGELOG
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -53,6 +53,34 @@ To add tags to a text:
|
|
53
53
|
|
54
54
|
Notice that you can retag the text as many times as you want.
|
55
55
|
|
56
|
+
### Tagging html converted to plain text
|
57
|
+
|
58
|
+
Sometimes it is necessary to convert html text to plain text to make sure that
|
59
|
+
search items are not separated by html tags. For situations like this, the gem
|
60
|
+
creates an intermediary structure which 'remembers' the positions of html tags and
|
61
|
+
recreates the correct html offsets from the offsets of the plain text.
|
62
|
+
|
63
|
+
html_text = "
|
64
|
+
<html>
|
65
|
+
<head>
|
66
|
+
<title>Days of the week</title>
|
67
|
+
</head>
|
68
|
+
<body>
|
69
|
+
<p>
|
70
|
+
There's <strong>Sunday</strong>
|
71
|
+
and there's <strong>Monday</strong>
|
72
|
+
</p>
|
73
|
+
</body>
|
74
|
+
</html>
|
75
|
+
"
|
76
|
+
tt = TagAlong::TaggedText.new(html_text)
|
77
|
+
text = tt.plain_text
|
78
|
+
# returns "There's Sunday and there's Monday" with lots of space junk
|
79
|
+
text_offsets = [[8, 13], [27, 32]]
|
80
|
+
html_offsets = tt.adjust_offsets(text_offsets)
|
81
|
+
tg = TagAlong.new(html_text, html_offsets)
|
82
|
+
tg.tag('<my_tag>', '</my_tag>')
|
83
|
+
|
56
84
|
### Dynamic tags
|
57
85
|
|
58
86
|
Sometimes tags contain a changeable component. To add dynamic data to tags:
|
data/lib/tag_along/offsets.rb
CHANGED
@@ -10,13 +10,13 @@ class TagAlong
|
|
10
10
|
@offset_end = (opts[:offset_end] || 'offset_end').to_sym
|
11
11
|
@data_start = (opts[:data_start] || 'data_start').to_sym
|
12
12
|
@data_end = (opts[:data_end] || 'data_end').to_sym
|
13
|
-
|
13
|
+
|
14
14
|
item = @offsets.first
|
15
15
|
if item.is_a?(Array)
|
16
16
|
process_array
|
17
17
|
elsif item.is_a?(Hash)
|
18
18
|
process_hash
|
19
|
-
else
|
19
|
+
else
|
20
20
|
process_obj
|
21
21
|
end
|
22
22
|
@offsets.sort_by!(&:offset_start)
|
@@ -28,6 +28,25 @@ class TagAlong
|
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
31
|
+
def [](num)
|
32
|
+
@offsets[num]
|
33
|
+
end
|
34
|
+
|
35
|
+
def shift
|
36
|
+
@offsets.shift
|
37
|
+
end
|
38
|
+
|
39
|
+
def empty?
|
40
|
+
@offsets.empty?
|
41
|
+
end
|
42
|
+
|
43
|
+
def << offset
|
44
|
+
unless offset.respond_to?(:offset_start) && offset.respond_to?(:offset_end)
|
45
|
+
raise TypeError.new('Object does not match Offset signature')
|
46
|
+
end
|
47
|
+
@offsets << offset
|
48
|
+
end
|
49
|
+
|
31
50
|
private
|
32
51
|
|
33
52
|
def process_array
|
@@ -36,17 +55,17 @@ class TagAlong
|
|
36
55
|
offset_end = o[1]
|
37
56
|
data_start = o[2]
|
38
57
|
data_end = o[3]
|
39
|
-
instantiate(offset_start, offset_end, data_start, data_end)
|
58
|
+
instantiate(offset_start, offset_end, data_start, data_end)
|
40
59
|
end
|
41
60
|
end
|
42
61
|
|
43
62
|
def process_hash
|
44
63
|
@offsets.each { |h| symbolize_keys(h) }
|
45
64
|
@offsets = @offsets.map do |h|
|
46
|
-
instantiate(h[@offset_start],
|
65
|
+
instantiate(h[@offset_start],
|
47
66
|
h[@offset_end],
|
48
67
|
h[@data_start],
|
49
|
-
h[@data_end])
|
68
|
+
h[@data_end])
|
50
69
|
end
|
51
70
|
end
|
52
71
|
|
@@ -56,10 +75,10 @@ class TagAlong
|
|
56
75
|
offset_end = obj.send(@offset_end)
|
57
76
|
data_start = obj.send(@data_start)
|
58
77
|
data_end = obj.send(@data_end)
|
59
|
-
instantiate(offset_start, offset_end, data_start, data_end)
|
78
|
+
instantiate(offset_start, offset_end, data_start, data_end)
|
60
79
|
end
|
61
80
|
end
|
62
|
-
|
81
|
+
|
63
82
|
def instantiate(offset_start, offset_end, data_start = nil, data_end = nil)
|
64
83
|
data_start = data_to_ary(data_start)
|
65
84
|
data_end = data_to_ary(data_end)
|
@@ -85,7 +104,7 @@ class TagAlong
|
|
85
104
|
a_hash[(key.to_sym rescue key) || key] = a_hash.delete(key)
|
86
105
|
end
|
87
106
|
end
|
88
|
-
|
107
|
+
|
89
108
|
end
|
90
109
|
|
91
110
|
end
|
@@ -0,0 +1,132 @@
|
|
1
|
+
require 'polyglot'
|
2
|
+
require 'treetop'
|
3
|
+
|
4
|
+
class TagAlong
|
5
|
+
|
6
|
+
class TaggedText
|
7
|
+
CHR = { '<' => 60, '>' => 62 }
|
8
|
+
SPACES = { 9 => true, 10 => true, 11 => true, 12 => true,
|
9
|
+
13 => true, 32 => true, 133 => true, 160 => true,
|
10
|
+
5760 => true, 6158 => true, 8192 => true,
|
11
|
+
8193 => true, 8194 => true, 8195 => true,
|
12
|
+
8196 => true, 8197 => true, 8198 => true,
|
13
|
+
8199 => true, 8200 => true, 8201 => true,
|
14
|
+
8202 => true, 8232 => true, 8233 => true,
|
15
|
+
8239 => true, 8287 => true, 12288 => true }
|
16
|
+
|
17
|
+
attr_reader :tagged_text, :offsets
|
18
|
+
|
19
|
+
def initialize(tagged_text, opts = {})
|
20
|
+
@normalize_spaces = true if opts[:normalize_spaces]
|
21
|
+
@tagged_text = tagged_text
|
22
|
+
@inside_tag = false
|
23
|
+
@inside_space = false
|
24
|
+
@offsets = []
|
25
|
+
@text = []
|
26
|
+
@text_offset = 0
|
27
|
+
@current_offset = { type: :text, start: 0, end: nil,
|
28
|
+
text_start: 0, text_end: nil }
|
29
|
+
process_tagged_text
|
30
|
+
end
|
31
|
+
|
32
|
+
def plain_text
|
33
|
+
@text.pack('U*')
|
34
|
+
end
|
35
|
+
|
36
|
+
def adjust_offsets(plain_text_offsets)
|
37
|
+
plain_text_offsets = plain_text_offsets.is_a?(Offsets) ?
|
38
|
+
plain_text_offsets :
|
39
|
+
Offsets.new(plain_text_offsets)
|
40
|
+
adjusted_offsets = TagAlong::Offsets.new([])
|
41
|
+
@offsets.each do |offset|
|
42
|
+
next if offset[:type] == :tag
|
43
|
+
process_offset(plain_text_offsets, offset, adjusted_offsets)
|
44
|
+
break if plain_text_offsets.empty?
|
45
|
+
end
|
46
|
+
adjusted_offsets
|
47
|
+
end
|
48
|
+
|
49
|
+
|
50
|
+
private
|
51
|
+
|
52
|
+
def process_offset(plain_text_offsets, offset, adjusted_offsets)
|
53
|
+
o = plain_text_offsets[0]
|
54
|
+
|
55
|
+
return if o.offset_start > offset[:text_start]
|
56
|
+
unless o.adj_start
|
57
|
+
delta = o.offset_start - offset[:text_start]
|
58
|
+
o.adj_start = offset[:start] + delta
|
59
|
+
end
|
60
|
+
|
61
|
+
if o.offset_end <= offset[:text_end]
|
62
|
+
delta = o.offset_end - offset[:text_end]
|
63
|
+
o = plain_text_offsets.shift
|
64
|
+
o.offset_start = o.delete_field(:adj_start)
|
65
|
+
o.offset_end = offset[:end] + delta
|
66
|
+
adjusted_offsets << o
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
def process_tagged_text
|
71
|
+
opts = { count: 0, chr: nil }
|
72
|
+
while opts[:chr] = tagged_text_ary.shift
|
73
|
+
@inside_tag ? process_inside_tag(opts) : process_outside_tag(opts)
|
74
|
+
opts[:count] += 1
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
def tagged_text_ary
|
79
|
+
@tagged_text_ary ||= @tagged_text.unpack('U*')
|
80
|
+
end
|
81
|
+
|
82
|
+
def process_outside_tag(opts)
|
83
|
+
if opts[:chr] == CHR['<']
|
84
|
+
@inside_tag = true
|
85
|
+
if opts[:count] > 0
|
86
|
+
@current_offset[:end] = opts[:count] - 1
|
87
|
+
@current_offset[:text_end] = @text_offset - 1
|
88
|
+
@offsets << @current_offset
|
89
|
+
end
|
90
|
+
@current_offset = { type: :tag, start: opts[:count], end: nil }
|
91
|
+
else
|
92
|
+
process_text(opts)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
def process_inside_tag(opts)
|
97
|
+
if opts[:chr] == CHR['>']
|
98
|
+
@inside_tag = false
|
99
|
+
@current_offset[:end] = opts[:count]
|
100
|
+
@offsets << @current_offset
|
101
|
+
@current_offset = { type: :text, start: opts[:count] + 1, end: nil,
|
102
|
+
text_start: @text_offset, text_end: nil }
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
def process_text(opts)
|
107
|
+
if @normalize_spaces
|
108
|
+
process_normalized_spaces_text(opts)
|
109
|
+
else
|
110
|
+
add_to_text(opts)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
def add_to_text(opts)
|
115
|
+
@text_offset += 1
|
116
|
+
@text << opts[:chr]
|
117
|
+
end
|
118
|
+
|
119
|
+
def process_normalized_spaces_text(opts)
|
120
|
+
@inside_space ? process_inside_space(opts) : process_outside_space(opts)
|
121
|
+
end
|
122
|
+
|
123
|
+
def process_inside_space(opts)
|
124
|
+
#TODO
|
125
|
+
end
|
126
|
+
|
127
|
+
def process_outside_space(opts)
|
128
|
+
#TODO
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
data/lib/tag_along/version.rb
CHANGED
data/lib/tag_along.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -29,5 +29,18 @@ unless defined?(SPEC_VARS)
|
|
29
29
|
FILES_DIR = File.expand_path(File.join(File.dirname(__FILE__), 'files'))
|
30
30
|
TEXT, OFFSETS_ARY, OFFSETS_HASH, OFFSETS_OBJ =
|
31
31
|
TagAlongSpec.process_spec_data(FILES_DIR)
|
32
|
+
HTML_TEXT = "
|
33
|
+
<html>
|
34
|
+
<head>
|
35
|
+
<title>Days of the week</title>
|
36
|
+
</head>
|
37
|
+
<body>
|
38
|
+
<p>
|
39
|
+
There's <strong>Sunday</strong>
|
40
|
+
and there's <strong>Monday</strong>
|
41
|
+
</p>
|
42
|
+
</body>
|
43
|
+
</html>
|
44
|
+
"
|
32
45
|
SPEC_VARS = true
|
33
46
|
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require_relative '../spec_helper'
|
2
|
+
|
3
|
+
describe TagAlong::TaggedText do
|
4
|
+
|
5
|
+
describe 'No space normalization' do
|
6
|
+
|
7
|
+
let(:text) do
|
8
|
+
"\n \n \n Days of the week" +
|
9
|
+
"\n \n \n \n There's" +
|
10
|
+
" Sunday\n and there's Monday" +
|
11
|
+
"\n \n \n \n "
|
12
|
+
end
|
13
|
+
|
14
|
+
let(:text_offsets) { [[77,82],[104,109]] }
|
15
|
+
|
16
|
+
subject { TagAlong::TaggedText.new(HTML_TEXT) }
|
17
|
+
|
18
|
+
its(:tagged_text) { should == HTML_TEXT }
|
19
|
+
its(:plain_text) { should == text }
|
20
|
+
it 'should get offsets' do
|
21
|
+
subject.offsets[1].should == { type: :tag , start: 5, end: 10 }
|
22
|
+
subject.offsets[2].should == { type: :text , start: 11,
|
23
|
+
end: 17, text_start: 5, text_end: 11 }
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'should adjust offsets' do
|
27
|
+
text_offsets.should == [[77,82],[104,109]]
|
28
|
+
offsets = subject.adjust_offsets(text_offsets)
|
29
|
+
offsets.should be_kind_of TagAlong::Offsets
|
30
|
+
offsets.map { |o| [o.offset_start, o.offset_end] }.should ==
|
31
|
+
[[128, 133], [172, 177]]
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
|
36
|
+
describe 'space normalization' do
|
37
|
+
let(:text) { "Days of the week There's Sunday and there's Monday" }
|
38
|
+
|
39
|
+
subject { TagAlong::TaggedText.new(HTML_TEXT, normalize_spaces: true) }
|
40
|
+
|
41
|
+
its(:tagged_text) { should == HTML_TEXT }
|
42
|
+
its(:plain_text) { should == text }
|
43
|
+
end
|
44
|
+
|
45
|
+
end
|
46
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tag_along
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.3
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-08 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: |-
|
14
14
|
Tags a text with arbitrary tags
|
@@ -29,10 +29,12 @@ files:
|
|
29
29
|
- Rakefile
|
30
30
|
- lib/tag_along.rb
|
31
31
|
- lib/tag_along/offsets.rb
|
32
|
+
- lib/tag_along/tagged_text.rb
|
32
33
|
- lib/tag_along/version.rb
|
33
34
|
- spec/files/spec_data.json
|
34
35
|
- spec/spec_helper.rb
|
35
36
|
- spec/tag_along/offsets_spec.rb
|
37
|
+
- spec/tag_along/tagged_text_spec.rb
|
36
38
|
- spec/tag_along_spec.rb
|
37
39
|
- tag_along.gemspec
|
38
40
|
homepage: https://github.com/GlobalNamesArchitecture/tag_along
|
@@ -65,4 +67,5 @@ test_files:
|
|
65
67
|
- spec/files/spec_data.json
|
66
68
|
- spec/spec_helper.rb
|
67
69
|
- spec/tag_along/offsets_spec.rb
|
70
|
+
- spec/tag_along/tagged_text_spec.rb
|
68
71
|
- spec/tag_along_spec.rb
|