tag_along 0.7.3 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3406f54328de2eda446a7267a7a8f40440a67866
4
- data.tar.gz: 2281e8cebc82ba0287920ac3f41c5b302c1f63fa
3
+ metadata.gz: 655afbd01a5e5232292d7795d5623182ac903813
4
+ data.tar.gz: 85eda2423fe4919d01e09e168cdccb7f0dc4ea8c
5
5
  SHA512:
6
- metadata.gz: 1420f52f09c7d5260abe8ebd6f9cdec024dabf10d3443ff9977149a7ab1cf9473ac1021d713ee9c816fcfff4fe0a2230da3e0875a4f3de7b97f445eae3138916
7
- data.tar.gz: ac5192be2da0d9264052945a7b02e0ba1d89470822c711b52ad3893f15d19701b48511016e3c2f9188ad32f9c43a23554c2051d0bacb0216d0ae45154a4fe617
6
+ metadata.gz: d0c4911a2bbc2d4d515d77bdede1064bb0a1e717c519307b22e247799f742fdff92577bfa9d4eb988f7e8bf19ec7eb107d025c613c5229ec958f4de4fe7fd8c0
7
+ data.tar.gz: 32cc83d07e519ebdf9f7b77820cf1da8506feb0c7f5c6e25434b252b4291fa82b70440ddeb1470f8270e16b657d7fcf6f313f49173dc1367afde9575aa8b8330
data/.gitignore CHANGED
@@ -1,6 +1,7 @@
1
1
  *.gem
2
2
  *.rbc
3
3
  .bundle
4
+ bundle_bin
4
5
  .config
5
6
  .yardoc
6
7
  Gemfile.lock
data/CHANGELOG CHANGED
@@ -1,3 +1,6 @@
1
+ 0.8.0 -- getting plain text from html/xml and readjusting offsets
2
+ to full version
3
+
1
4
  0.7.3 -- removed junk files plus refactoring
2
5
 
3
6
  0.7.2 -- end of the text preserved
data/Gemfile CHANGED
@@ -1,6 +1,7 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
3
  gem 'json', '~> 1.7'
4
+ gem 'treetop', '~> 1.4'
4
5
 
5
6
  group :development do
6
7
  gem 'bundler', '~> 1.3'
data/README.md CHANGED
@@ -53,6 +53,34 @@ To add tags to a text:
53
53
 
54
54
  Notice that you can retag the text as many times as you want.
55
55
 
56
+ ### Tagging html converted to plain text
57
+
58
+ Sometimes it is necessary to convert html text to plain text to make sure that
59
+ search items are not separated by html tags. For situations like this, the gem
60
+ creates an intermediary structure which 'remembers' the positions of html tags and
61
+ recreates correct offsets from offsets of the plain text.
62
+
63
+ html_text = "
64
+ <html>
65
+ <head>
66
+ <title>Days of the week</title>
67
+ </head>
68
+ <body>
69
+ <p>
70
+ There's <strong>Sunday</strong>
71
+ and there's <strong>Monday</strong>
72
+ </p>
73
+ </body>
74
+ </html>
75
+ "
76
+ hc = TagAlong::TaggedText.new(html_text)
77
+ text = hc.plain_text
78
+ # returns "There's Sunday and there's Monday" with lots of space junk
79
+ text_offsets = [[8, 13], [27, 32]]
80
+ html_offsets = hc.adjust_offsets(text_offsets)
81
+ tg = TagAlong.new(html_text, html_offsets)
82
+ tg.tag('<my_tag>', '</my_tag>')
83
+
56
84
  ### Dynamic tags
57
85
 
58
86
  Sometimes tags contain a changeable component. To add dynamic data to tags:
@@ -10,13 +10,13 @@ class TagAlong
10
10
  @offset_end = (opts[:offset_end] || 'offset_end').to_sym
11
11
  @data_start = (opts[:data_start] || 'data_start').to_sym
12
12
  @data_end = (opts[:data_end] || 'data_end').to_sym
13
-
13
+
14
14
  item = @offsets.first
15
15
  if item.is_a?(Array)
16
16
  process_array
17
17
  elsif item.is_a?(Hash)
18
18
  process_hash
19
- else
19
+ else
20
20
  process_obj
21
21
  end
22
22
  @offsets.sort_by!(&:offset_start)
@@ -28,6 +28,25 @@ class TagAlong
28
28
  end
29
29
  end
30
30
 
31
+ def [](num)
32
+ @offsets[num]
33
+ end
34
+
35
+ def shift
36
+ @offsets.shift
37
+ end
38
+
39
+ def empty?
40
+ @offsets.empty?
41
+ end
42
+
43
+ def << offset
44
+ unless offset.respond_to?(:offset_start) && offset.respond_to?(:offset_end)
45
+ raise TypeError.new('Object does not match Offset signature')
46
+ end
47
+ @offsets << offset
48
+ end
49
+
31
50
  private
32
51
 
33
52
  def process_array
@@ -36,17 +55,17 @@ class TagAlong
36
55
  offset_end = o[1]
37
56
  data_start = o[2]
38
57
  data_end = o[3]
39
- instantiate(offset_start, offset_end, data_start, data_end)
58
+ instantiate(offset_start, offset_end, data_start, data_end)
40
59
  end
41
60
  end
42
61
 
43
62
  def process_hash
44
63
  @offsets.each { |h| symbolize_keys(h) }
45
64
  @offsets = @offsets.map do |h|
46
- instantiate(h[@offset_start],
65
+ instantiate(h[@offset_start],
47
66
  h[@offset_end],
48
67
  h[@data_start],
49
- h[@data_end])
68
+ h[@data_end])
50
69
  end
51
70
  end
52
71
 
@@ -56,10 +75,10 @@ class TagAlong
56
75
  offset_end = obj.send(@offset_end)
57
76
  data_start = obj.send(@data_start)
58
77
  data_end = obj.send(@data_end)
59
- instantiate(offset_start, offset_end, data_start, data_end)
78
+ instantiate(offset_start, offset_end, data_start, data_end)
60
79
  end
61
80
  end
62
-
81
+
63
82
  def instantiate(offset_start, offset_end, data_start = nil, data_end = nil)
64
83
  data_start = data_to_ary(data_start)
65
84
  data_end = data_to_ary(data_end)
@@ -85,7 +104,7 @@ class TagAlong
85
104
  a_hash[(key.to_sym rescue key) || key] = a_hash.delete(key)
86
105
  end
87
106
  end
88
-
107
+
89
108
  end
90
109
 
91
110
  end
@@ -0,0 +1,132 @@
1
+ require 'polyglot'
2
+ require 'treetop'
3
+
4
+ class TagAlong
5
+
6
+ class TaggedText
7
+ CHR = { '<' => 60, '>' => 62 }
8
+ SPACES = { 9 => true, 10 => true, 11 => true, 12 => true,
9
+ 13 => true, 32 => true, 133 => true, 160 => true,
10
+ 5760 => true, 6158 => true, 8192 => true,
11
+ 8193 => true, 8194 => true, 8195 => true,
12
+ 8196 => true, 8197 => true, 8198 => true,
13
+ 8199 => true, 8200 => true, 8201 => true,
14
+ 8202 => true, 8232 => true, 8233 => true,
15
+ 8239 => true, 8287 => true, 12288 => true }
16
+
17
+ attr_reader :tagged_text, :offsets
18
+
19
+ def initialize(tagged_text, opts = {})
20
+ @normalize_spaces = true if opts[:normalize_spaces]
21
+ @tagged_text = tagged_text
22
+ @inside_tag = false
23
+ @inside_space = false
24
+ @offsets = []
25
+ @text = []
26
+ @text_offset = 0
27
+ @current_offset = { type: :text, start: 0, end: nil,
28
+ text_start: 0, text_end: nil }
29
+ process_tagged_text
30
+ end
31
+
32
+ def plain_text
33
+ @text.pack('U*')
34
+ end
35
+
36
+ def adjust_offsets(plain_text_offsets)
37
+ plain_text_offsets = plain_text_offsets.is_a?(Offsets) ?
38
+ plain_text_offsets :
39
+ Offsets.new(plain_text_offsets)
40
+ adjusted_offsets = TagAlong::Offsets.new([])
41
+ @offsets.each do |offset|
42
+ next if offset[:type] == :tag
43
+ process_offset(plain_text_offsets, offset, adjusted_offsets)
44
+ break if plain_text_offsets.empty?
45
+ end
46
+ adjusted_offsets
47
+ end
48
+
49
+
50
+ private
51
+
52
+ def process_offset(plain_text_offsets, offset, adjusted_offsets)
53
+ o = plain_text_offsets[0]
54
+
55
+ return if o.offset_start > offset[:text_start]
56
+ unless o.adj_start
57
+ delta = o.offset_start - offset[:text_start]
58
+ o.adj_start = offset[:start] + delta
59
+ end
60
+
61
+ if o.offset_end <= offset[:text_end]
62
+ delta = o.offset_end - offset[:text_end]
63
+ o = plain_text_offsets.shift
64
+ o.offset_start = o.delete_field(:adj_start)
65
+ o.offset_end = offset[:end] + delta
66
+ adjusted_offsets << o
67
+ end
68
+ end
69
+
70
+ def process_tagged_text
71
+ opts = { count: 0, chr: nil }
72
+ while opts[:chr] = tagged_text_ary.shift
73
+ @inside_tag ? process_inside_tag(opts) : process_outside_tag(opts)
74
+ opts[:count] += 1
75
+ end
76
+ end
77
+
78
+ def tagged_text_ary
79
+ @tagged_text_ary ||= @tagged_text.unpack('U*')
80
+ end
81
+
82
+ def process_outside_tag(opts)
83
+ if opts[:chr] == CHR['<']
84
+ @inside_tag = true
85
+ if opts[:count] > 0
86
+ @current_offset[:end] = opts[:count] - 1
87
+ @current_offset[:text_end] = @text_offset - 1
88
+ @offsets << @current_offset
89
+ end
90
+ @current_offset = { type: :tag, start: opts[:count], end: nil }
91
+ else
92
+ process_text(opts)
93
+ end
94
+ end
95
+
96
+ def process_inside_tag(opts)
97
+ if opts[:chr] == CHR['>']
98
+ @inside_tag = false
99
+ @current_offset[:end] = opts[:count]
100
+ @offsets << @current_offset
101
+ @current_offset = { type: :text, start: opts[:count] + 1, end: nil,
102
+ text_start: @text_offset, text_end: nil }
103
+ end
104
+ end
105
+
106
+ def process_text(opts)
107
+ if @normalize_spaces
108
+ process_normalized_spaces_text(opts)
109
+ else
110
+ add_to_text(opts)
111
+ end
112
+ end
113
+
114
+ def add_to_text(opts)
115
+ @text_offset += 1
116
+ @text << opts[:chr]
117
+ end
118
+
119
+ def process_normalized_spaces_text(opts)
120
+ @inside_space ? process_inside_space(opts) : process_outside_space(opts)
121
+ end
122
+
123
+ def process_inside_space(opts)
124
+ #TODO
125
+ end
126
+
127
+ def process_outside_space(opts)
128
+ #TODO
129
+ end
130
+ end
131
+ end
132
+
@@ -1,3 +1,3 @@
1
1
  class TagAlong
2
- VERSION = '0.7.3'
2
+ VERSION = '0.8.0'
3
3
  end
data/lib/tag_along.rb CHANGED
@@ -1,10 +1,10 @@
1
1
  require 'ostruct'
2
2
  require 'tag_along/version'
3
3
  require 'tag_along/offsets'
4
+ require 'tag_along/tagged_text'
4
5
 
5
6
  class TagAlong
6
-
7
- attr :text, :tagged_text
7
+ attr :text, :tagged_text
8
8
 
9
9
  def self.version
10
10
  VERSION
data/spec/spec_helper.rb CHANGED
@@ -29,5 +29,18 @@ unless defined?(SPEC_VARS)
29
29
  FILES_DIR = File.expand_path(File.join(File.dirname(__FILE__), 'files'))
30
30
  TEXT, OFFSETS_ARY, OFFSETS_HASH, OFFSETS_OBJ =
31
31
  TagAlongSpec.process_spec_data(FILES_DIR)
32
+ HTML_TEXT = "
33
+ <html>
34
+ <head>
35
+ <title>Days of the week</title>
36
+ </head>
37
+ <body>
38
+ <p>
39
+ There's <strong>Sunday</strong>
40
+ and there's <strong>Monday</strong>
41
+ </p>
42
+ </body>
43
+ </html>
44
+ "
32
45
  SPEC_VARS = true
33
46
  end
@@ -0,0 +1,46 @@
1
+ require_relative '../spec_helper'
2
+
3
+ describe TagAlong::TaggedText do
4
+
5
+ describe 'No space normalization' do
6
+
7
+ let(:text) do
8
+ "\n \n \n Days of the week" +
9
+ "\n \n \n \n There's" +
10
+ " Sunday\n and there's Monday" +
11
+ "\n \n \n \n "
12
+ end
13
+
14
+ let(:text_offsets) { [[77,82],[104,109]] }
15
+
16
+ subject { TagAlong::TaggedText.new(HTML_TEXT) }
17
+
18
+ its(:tagged_text) { should == HTML_TEXT }
19
+ its(:plain_text) { should == text }
20
+ it 'should get offsets' do
21
+ subject.offsets[1].should == { type: :tag , start: 5, end: 10 }
22
+ subject.offsets[2].should == { type: :text , start: 11,
23
+ end: 17, text_start: 5, text_end: 11 }
24
+ end
25
+
26
+ it 'should adjust offsets' do
27
+ text_offsets.should == [[77,82],[104,109]]
28
+ offsets = subject.adjust_offsets(text_offsets)
29
+ offsets.should be_kind_of TagAlong::Offsets
30
+ offsets.map { |o| [o.offset_start, o.offset_end] }.should ==
31
+ [[128, 133], [172, 177]]
32
+ end
33
+
34
+ end
35
+
36
+ describe 'space normalization' do
37
+ let(:text) { "Days of the week There's Sunday and there's Monday" }
38
+
39
+ subject { TagAlong::TaggedText.new(HTML_TEXT, normalize_spaces: true) }
40
+
41
+ its(:tagged_text) { should == HTML_TEXT }
42
+ its(:plain_text) { should == text }
43
+ end
44
+
45
+ end
46
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tag_along
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.3
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-12 00:00:00.000000000 Z
11
+ date: 2013-10-08 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: |-
14
14
  Tags a text with arbitrary tags
@@ -29,10 +29,12 @@ files:
29
29
  - Rakefile
30
30
  - lib/tag_along.rb
31
31
  - lib/tag_along/offsets.rb
32
+ - lib/tag_along/tagged_text.rb
32
33
  - lib/tag_along/version.rb
33
34
  - spec/files/spec_data.json
34
35
  - spec/spec_helper.rb
35
36
  - spec/tag_along/offsets_spec.rb
37
+ - spec/tag_along/tagged_text_spec.rb
36
38
  - spec/tag_along_spec.rb
37
39
  - tag_along.gemspec
38
40
  homepage: https://github.com/GlobalNamesArchitecture/tag_along
@@ -65,4 +67,5 @@ test_files:
65
67
  - spec/files/spec_data.json
66
68
  - spec/spec_helper.rb
67
69
  - spec/tag_along/offsets_spec.rb
70
+ - spec/tag_along/tagged_text_spec.rb
68
71
  - spec/tag_along_spec.rb