tag_along 0.7.3 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG +3 -0
- data/Gemfile +1 -0
- data/README.md +28 -0
- data/lib/tag_along/offsets.rb +27 -8
- data/lib/tag_along/tagged_text.rb +132 -0
- data/lib/tag_along/version.rb +1 -1
- data/lib/tag_along.rb +2 -2
- data/spec/spec_helper.rb +13 -0
- data/spec/tag_along/tagged_text_spec.rb +46 -0
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 655afbd01a5e5232292d7795d5623182ac903813
|
4
|
+
data.tar.gz: 85eda2423fe4919d01e09e168cdccb7f0dc4ea8c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d0c4911a2bbc2d4d515d77bdede1064bb0a1e717c519307b22e247799f742fdff92577bfa9d4eb988f7e8bf19ec7eb107d025c613c5229ec958f4de4fe7fd8c0
|
7
|
+
data.tar.gz: 32cc83d07e519ebdf9f7b77820cf1da8506feb0c7f5c6e25434b252b4291fa82b70440ddeb1470f8270e16b657d7fcf6f313f49173dc1367afde9575aa8b8330
|
data/.gitignore
CHANGED
data/CHANGELOG
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -53,6 +53,34 @@ To add tags to a text:
|
|
53
53
|
|
54
54
|
Notice that you can retag the text as many times as you want.
|
55
55
|
|
56
|
+
### Tagging html converted to plain text
|
57
|
+
|
58
|
+
Sometimes it is necessary to convert html text to plain text to make sure that
|
59
|
+
search items are not separated by html tags. For situations like this the gem
|
60
|
+
creates an intermediary structure which 'remembers' the positions of html tags and
|
61
|
+
recreates correct offsets from offsets of the plain text.
|
62
|
+
|
63
|
+
html_text = "
|
64
|
+
<html>
|
65
|
+
<head>
|
66
|
+
<title>Days of the week</title>
|
67
|
+
</head>
|
68
|
+
<body>
|
69
|
+
<p>
|
70
|
+
There's <strong>Sunday</strong>
|
71
|
+
and there's <strong>Monday</strong>
|
72
|
+
</p>
|
73
|
+
</body>
|
74
|
+
</html>
|
75
|
+
"
|
76
|
+
tt = TagAlong::TaggedText.new(html_text)
|
77
|
+
text = tt.plain_text
|
78
|
+
# returns "There's Sunday and there's Monday" with lots of space junk
|
79
|
+
text_offsets = [[8, 13], [27, 32]]
|
80
|
+
html_offsets = tt.adjust_offsets(text_offsets)
|
81
|
+
tg = TagAlong.new(html_text, html_offsets)
|
82
|
+
tg.tag('<my_tag>', '</my_tag>')
|
83
|
+
|
56
84
|
### Dynamic tags
|
57
85
|
|
58
86
|
Sometimes tags contain a changeable component. To add dynamic data to tags:
|
data/lib/tag_along/offsets.rb
CHANGED
@@ -10,13 +10,13 @@ class TagAlong
|
|
10
10
|
@offset_end = (opts[:offset_end] || 'offset_end').to_sym
|
11
11
|
@data_start = (opts[:data_start] || 'data_start').to_sym
|
12
12
|
@data_end = (opts[:data_end] || 'data_end').to_sym
|
13
|
-
|
13
|
+
|
14
14
|
item = @offsets.first
|
15
15
|
if item.is_a?(Array)
|
16
16
|
process_array
|
17
17
|
elsif item.is_a?(Hash)
|
18
18
|
process_hash
|
19
|
-
else
|
19
|
+
else
|
20
20
|
process_obj
|
21
21
|
end
|
22
22
|
@offsets.sort_by!(&:offset_start)
|
@@ -28,6 +28,25 @@ class TagAlong
|
|
28
28
|
end
|
29
29
|
end
|
30
30
|
|
31
|
+
def [](num)
|
32
|
+
@offsets[num]
|
33
|
+
end
|
34
|
+
|
35
|
+
def shift
|
36
|
+
@offsets.shift
|
37
|
+
end
|
38
|
+
|
39
|
+
def empty?
|
40
|
+
@offsets.empty?
|
41
|
+
end
|
42
|
+
|
43
|
+
def << offset
|
44
|
+
unless offset.respond_to?(:offset_start) && offset.respond_to?(:offset_end)
|
45
|
+
raise TypeError.new('Object does not match Offset signature')
|
46
|
+
end
|
47
|
+
@offsets << offset
|
48
|
+
end
|
49
|
+
|
31
50
|
private
|
32
51
|
|
33
52
|
def process_array
|
@@ -36,17 +55,17 @@ class TagAlong
|
|
36
55
|
offset_end = o[1]
|
37
56
|
data_start = o[2]
|
38
57
|
data_end = o[3]
|
39
|
-
instantiate(offset_start, offset_end, data_start, data_end)
|
58
|
+
instantiate(offset_start, offset_end, data_start, data_end)
|
40
59
|
end
|
41
60
|
end
|
42
61
|
|
43
62
|
def process_hash
|
44
63
|
@offsets.each { |h| symbolize_keys(h) }
|
45
64
|
@offsets = @offsets.map do |h|
|
46
|
-
instantiate(h[@offset_start],
|
65
|
+
instantiate(h[@offset_start],
|
47
66
|
h[@offset_end],
|
48
67
|
h[@data_start],
|
49
|
-
h[@data_end])
|
68
|
+
h[@data_end])
|
50
69
|
end
|
51
70
|
end
|
52
71
|
|
@@ -56,10 +75,10 @@ class TagAlong
|
|
56
75
|
offset_end = obj.send(@offset_end)
|
57
76
|
data_start = obj.send(@data_start)
|
58
77
|
data_end = obj.send(@data_end)
|
59
|
-
instantiate(offset_start, offset_end, data_start, data_end)
|
78
|
+
instantiate(offset_start, offset_end, data_start, data_end)
|
60
79
|
end
|
61
80
|
end
|
62
|
-
|
81
|
+
|
63
82
|
def instantiate(offset_start, offset_end, data_start = nil, data_end = nil)
|
64
83
|
data_start = data_to_ary(data_start)
|
65
84
|
data_end = data_to_ary(data_end)
|
@@ -85,7 +104,7 @@ class TagAlong
|
|
85
104
|
a_hash[(key.to_sym rescue key) || key] = a_hash.delete(key)
|
86
105
|
end
|
87
106
|
end
|
88
|
-
|
107
|
+
|
89
108
|
end
|
90
109
|
|
91
110
|
end
|
@@ -0,0 +1,132 @@
|
|
1
|
+
require 'polyglot'
|
2
|
+
require 'treetop'
|
3
|
+
|
4
|
+
class TagAlong
|
5
|
+
|
6
|
+
class TaggedText
|
7
|
+
CHR = { '<' => 60, '>' => 62 }
|
8
|
+
SPACES = { 9 => true, 10 => true, 11 => true, 12 => true,
|
9
|
+
13 => true, 32 => true, 133 => true, 160 => true,
|
10
|
+
5760 => true, 6158 => true, 8192 => true,
|
11
|
+
8193 => true, 8194 => true, 8195 => true,
|
12
|
+
8196 => true, 8197 => true, 8198 => true,
|
13
|
+
8199 => true, 8200 => true, 8201 => true,
|
14
|
+
8202 => true, 8232 => true, 8233 => true,
|
15
|
+
8239 => true, 8287 => true, 12288 => true }
|
16
|
+
|
17
|
+
attr_reader :tagged_text, :offsets
|
18
|
+
|
19
|
+
def initialize(tagged_text, opts = {})
|
20
|
+
@normalize_spaces = true if opts[:normalize_spaces]
|
21
|
+
@tagged_text = tagged_text
|
22
|
+
@inside_tag = false
|
23
|
+
@inside_space = false
|
24
|
+
@offsets = []
|
25
|
+
@text = []
|
26
|
+
@text_offset = 0
|
27
|
+
@current_offset = { type: :text, start: 0, end: nil,
|
28
|
+
text_start: 0, text_end: nil }
|
29
|
+
process_tagged_text
|
30
|
+
end
|
31
|
+
|
32
|
+
def plain_text
|
33
|
+
@text.pack('U*')
|
34
|
+
end
|
35
|
+
|
36
|
+
def adjust_offsets(plain_text_offsets)
|
37
|
+
plain_text_offsets = plain_text_offsets.is_a?(Offsets) ?
|
38
|
+
plain_text_offsets :
|
39
|
+
Offsets.new(plain_text_offsets)
|
40
|
+
adjusted_offsets = TagAlong::Offsets.new([])
|
41
|
+
@offsets.each do |offset|
|
42
|
+
next if offset[:type] == :tag
|
43
|
+
process_offset(plain_text_offsets, offset, adjusted_offsets)
|
44
|
+
break if plain_text_offsets.empty?
|
45
|
+
end
|
46
|
+
adjusted_offsets
|
47
|
+
end
|
48
|
+
|
49
|
+
|
50
|
+
private
|
51
|
+
|
52
|
+
def process_offset(plain_text_offsets, offset, adjusted_offsets)
|
53
|
+
o = plain_text_offsets[0]
|
54
|
+
|
55
|
+
return if o.offset_start > offset[:text_start]
|
56
|
+
unless o.adj_start
|
57
|
+
delta = o.offset_start - offset[:text_start]
|
58
|
+
o.adj_start = offset[:start] + delta
|
59
|
+
end
|
60
|
+
|
61
|
+
if o.offset_end <= offset[:text_end]
|
62
|
+
delta = o.offset_end - offset[:text_end]
|
63
|
+
o = plain_text_offsets.shift
|
64
|
+
o.offset_start = o.delete_field(:adj_start)
|
65
|
+
o.offset_end = offset[:end] + delta
|
66
|
+
adjusted_offsets << o
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
70
|
+
def process_tagged_text
|
71
|
+
opts = { count: 0, chr: nil }
|
72
|
+
while opts[:chr] = tagged_text_ary.shift
|
73
|
+
@inside_tag ? process_inside_tag(opts) : process_outside_tag(opts)
|
74
|
+
opts[:count] += 1
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
def tagged_text_ary
|
79
|
+
@tagged_text_ary ||= @tagged_text.unpack('U*')
|
80
|
+
end
|
81
|
+
|
82
|
+
def process_outside_tag(opts)
|
83
|
+
if opts[:chr] == CHR['<']
|
84
|
+
@inside_tag = true
|
85
|
+
if opts[:count] > 0
|
86
|
+
@current_offset[:end] = opts[:count] - 1
|
87
|
+
@current_offset[:text_end] = @text_offset - 1
|
88
|
+
@offsets << @current_offset
|
89
|
+
end
|
90
|
+
@current_offset = { type: :tag, start: opts[:count], end: nil }
|
91
|
+
else
|
92
|
+
process_text(opts)
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
def process_inside_tag(opts)
|
97
|
+
if opts[:chr] == CHR['>']
|
98
|
+
@inside_tag = false
|
99
|
+
@current_offset[:end] = opts[:count]
|
100
|
+
@offsets << @current_offset
|
101
|
+
@current_offset = { type: :text, start: opts[:count] + 1, end: nil,
|
102
|
+
text_start: @text_offset, text_end: nil }
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
106
|
+
def process_text(opts)
|
107
|
+
if @normalize_spaces
|
108
|
+
process_normalized_spaces_text(opts)
|
109
|
+
else
|
110
|
+
add_to_text(opts)
|
111
|
+
end
|
112
|
+
end
|
113
|
+
|
114
|
+
def add_to_text(opts)
|
115
|
+
@text_offset += 1
|
116
|
+
@text << opts[:chr]
|
117
|
+
end
|
118
|
+
|
119
|
+
def process_normalized_spaces_text(opts)
|
120
|
+
@inside_space ? process_inside_space(opts) : process_outside_space(opts)
|
121
|
+
end
|
122
|
+
|
123
|
+
def process_inside_space(opts)
|
124
|
+
#TODO
|
125
|
+
end
|
126
|
+
|
127
|
+
def process_outside_space(opts)
|
128
|
+
#TODO
|
129
|
+
end
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
data/lib/tag_along/version.rb
CHANGED
data/lib/tag_along.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -29,5 +29,18 @@ unless defined?(SPEC_VARS)
|
|
29
29
|
FILES_DIR = File.expand_path(File.join(File.dirname(__FILE__), 'files'))
|
30
30
|
TEXT, OFFSETS_ARY, OFFSETS_HASH, OFFSETS_OBJ =
|
31
31
|
TagAlongSpec.process_spec_data(FILES_DIR)
|
32
|
+
HTML_TEXT = "
|
33
|
+
<html>
|
34
|
+
<head>
|
35
|
+
<title>Days of the week</title>
|
36
|
+
</head>
|
37
|
+
<body>
|
38
|
+
<p>
|
39
|
+
There's <strong>Sunday</strong>
|
40
|
+
and there's <strong>Monday</strong>
|
41
|
+
</p>
|
42
|
+
</body>
|
43
|
+
</html>
|
44
|
+
"
|
32
45
|
SPEC_VARS = true
|
33
46
|
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require_relative '../spec_helper'
|
2
|
+
|
3
|
+
describe TagAlong::TaggedText do
|
4
|
+
|
5
|
+
describe 'No space normalization' do
|
6
|
+
|
7
|
+
let(:text) do
|
8
|
+
"\n \n \n Days of the week" +
|
9
|
+
"\n \n \n \n There's" +
|
10
|
+
" Sunday\n and there's Monday" +
|
11
|
+
"\n \n \n \n "
|
12
|
+
end
|
13
|
+
|
14
|
+
let(:text_offsets) { [[77,82],[104,109]] }
|
15
|
+
|
16
|
+
subject { TagAlong::TaggedText.new(HTML_TEXT) }
|
17
|
+
|
18
|
+
its(:tagged_text) { should == HTML_TEXT }
|
19
|
+
its(:plain_text) { should == text }
|
20
|
+
it 'should get offsets' do
|
21
|
+
subject.offsets[1].should == { type: :tag , start: 5, end: 10 }
|
22
|
+
subject.offsets[2].should == { type: :text , start: 11,
|
23
|
+
end: 17, text_start: 5, text_end: 11 }
|
24
|
+
end
|
25
|
+
|
26
|
+
it 'should adjust offsets' do
|
27
|
+
text_offsets.should == [[77,82],[104,109]]
|
28
|
+
offsets = subject.adjust_offsets(text_offsets)
|
29
|
+
offsets.should be_kind_of TagAlong::Offsets
|
30
|
+
offsets.map { |o| [o.offset_start, o.offset_end] }.should ==
|
31
|
+
[[128, 133], [172, 177]]
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
35
|
+
|
36
|
+
describe 'space normalization' do
|
37
|
+
let(:text) { "Days of the week There's Sunday and there's Monday" }
|
38
|
+
|
39
|
+
subject { TagAlong::TaggedText.new(HTML_TEXT, normalize_spaces: true) }
|
40
|
+
|
41
|
+
its(:tagged_text) { should == HTML_TEXT }
|
42
|
+
its(:plain_text) { should == text }
|
43
|
+
end
|
44
|
+
|
45
|
+
end
|
46
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tag_along
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.3
|
4
|
+
version: 0.8.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dmitry Mozzherin
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-08 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: |-
|
14
14
|
Tags a text with arbitrary tags
|
@@ -29,10 +29,12 @@ files:
|
|
29
29
|
- Rakefile
|
30
30
|
- lib/tag_along.rb
|
31
31
|
- lib/tag_along/offsets.rb
|
32
|
+
- lib/tag_along/tagged_text.rb
|
32
33
|
- lib/tag_along/version.rb
|
33
34
|
- spec/files/spec_data.json
|
34
35
|
- spec/spec_helper.rb
|
35
36
|
- spec/tag_along/offsets_spec.rb
|
37
|
+
- spec/tag_along/tagged_text_spec.rb
|
36
38
|
- spec/tag_along_spec.rb
|
37
39
|
- tag_along.gemspec
|
38
40
|
homepage: https://github.com/GlobalNamesArchitecture/tag_along
|
@@ -65,4 +67,5 @@ test_files:
|
|
65
67
|
- spec/files/spec_data.json
|
66
68
|
- spec/spec_helper.rb
|
67
69
|
- spec/tag_along/offsets_spec.rb
|
70
|
+
- spec/tag_along/tagged_text_spec.rb
|
68
71
|
- spec/tag_along_spec.rb
|