tag_along 0.7.3 → 0.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 3406f54328de2eda446a7267a7a8f40440a67866
4
- data.tar.gz: 2281e8cebc82ba0287920ac3f41c5b302c1f63fa
3
+ metadata.gz: 655afbd01a5e5232292d7795d5623182ac903813
4
+ data.tar.gz: 85eda2423fe4919d01e09e168cdccb7f0dc4ea8c
5
5
  SHA512:
6
- metadata.gz: 1420f52f09c7d5260abe8ebd6f9cdec024dabf10d3443ff9977149a7ab1cf9473ac1021d713ee9c816fcfff4fe0a2230da3e0875a4f3de7b97f445eae3138916
7
- data.tar.gz: ac5192be2da0d9264052945a7b02e0ba1d89470822c711b52ad3893f15d19701b48511016e3c2f9188ad32f9c43a23554c2051d0bacb0216d0ae45154a4fe617
6
+ metadata.gz: d0c4911a2bbc2d4d515d77bdede1064bb0a1e717c519307b22e247799f742fdff92577bfa9d4eb988f7e8bf19ec7eb107d025c613c5229ec958f4de4fe7fd8c0
7
+ data.tar.gz: 32cc83d07e519ebdf9f7b77820cf1da8506feb0c7f5c6e25434b252b4291fa82b70440ddeb1470f8270e16b657d7fcf6f313f49173dc1367afde9575aa8b8330
data/.gitignore CHANGED
@@ -1,6 +1,7 @@
1
1
  *.gem
2
2
  *.rbc
3
3
  .bundle
4
+ bundle_bin
4
5
  .config
5
6
  .yardoc
6
7
  Gemfile.lock
data/CHANGELOG CHANGED
@@ -1,3 +1,6 @@
1
+ 0.8.0 -- getting plain text from html/xml and readjusting offsets
2
+ to full version
3
+
1
4
  0.7.3 -- removed junk files plus refactoring
2
5
 
3
6
  0.7.2 -- end of the text preserved
data/Gemfile CHANGED
@@ -1,6 +1,7 @@
1
1
  source 'https://rubygems.org'
2
2
 
3
3
  gem 'json', '~> 1.7'
4
+ gem 'treetop', '~> 1.4'
4
5
 
5
6
  group :development do
6
7
  gem 'bundler', '~> 1.3'
data/README.md CHANGED
@@ -53,6 +53,34 @@ To add tags to a text:
53
53
 
54
54
  Notice that you can retag the text as many times as you want.
55
55
 
56
+ ### Tagging html converted to plain text
57
+
58
+ Sometimes it is necessary to convert html text to plain text to make sure that
59
+ search items are not separated by html tags. For situations like this, the gem
60
+ creates an intermediary structure which 'remembers' the positions of html tags and
61
+ recreates the correct html offsets from the offsets of the plain text.
62
+
63
+ html_text = "
64
+ <html>
65
+ <head>
66
+ <title>Days of the week</title>
67
+ </head>
68
+ <body>
69
+ <p>
70
+ There's <strong>Sunday</strong>
71
+ and there's <strong>Monday</strong>
72
+ </p>
73
+ </body>
74
+ </html>
75
+ "
76
+ tt = TagAlong::TaggedText.new(html_text)
77
+ text = tt.plain_text
78
+ # returns "There's Sunday and there's Monday" with lots of space junk
79
+ text_offsets = [[8, 13], [27, 32]]
80
+ html_offsets = tt.adjust_offsets(text_offsets)
81
+ tg = TagAlong.new(html_text, html_offsets)
82
+ tg.tag('<my_tag>', '</my_tag>')
83
+
56
84
  ### Dynamic tags
57
85
 
58
86
  Sometimes tags contain a changeable component. To add dynamic data to tags:
@@ -10,13 +10,13 @@ class TagAlong
10
10
  @offset_end = (opts[:offset_end] || 'offset_end').to_sym
11
11
  @data_start = (opts[:data_start] || 'data_start').to_sym
12
12
  @data_end = (opts[:data_end] || 'data_end').to_sym
13
-
13
+
14
14
  item = @offsets.first
15
15
  if item.is_a?(Array)
16
16
  process_array
17
17
  elsif item.is_a?(Hash)
18
18
  process_hash
19
- else
19
+ else
20
20
  process_obj
21
21
  end
22
22
  @offsets.sort_by!(&:offset_start)
@@ -28,6 +28,25 @@ class TagAlong
28
28
  end
29
29
  end
30
30
 
31
+ def [](num)
32
+ @offsets[num]
33
+ end
34
+
35
+ def shift
36
+ @offsets.shift
37
+ end
38
+
39
+ def empty?
40
+ @offsets.empty?
41
+ end
42
+
43
+ def << offset
44
+ unless offset.respond_to?(:offset_start) && offset.respond_to?(:offset_end)
45
+ raise TypeError.new('Object does not match Offset signature')
46
+ end
47
+ @offsets << offset
48
+ end
49
+
31
50
  private
32
51
 
33
52
  def process_array
@@ -36,17 +55,17 @@ class TagAlong
36
55
  offset_end = o[1]
37
56
  data_start = o[2]
38
57
  data_end = o[3]
39
- instantiate(offset_start, offset_end, data_start, data_end)
58
+ instantiate(offset_start, offset_end, data_start, data_end)
40
59
  end
41
60
  end
42
61
 
43
62
  def process_hash
44
63
  @offsets.each { |h| symbolize_keys(h) }
45
64
  @offsets = @offsets.map do |h|
46
- instantiate(h[@offset_start],
65
+ instantiate(h[@offset_start],
47
66
  h[@offset_end],
48
67
  h[@data_start],
49
- h[@data_end])
68
+ h[@data_end])
50
69
  end
51
70
  end
52
71
 
@@ -56,10 +75,10 @@ class TagAlong
56
75
  offset_end = obj.send(@offset_end)
57
76
  data_start = obj.send(@data_start)
58
77
  data_end = obj.send(@data_end)
59
- instantiate(offset_start, offset_end, data_start, data_end)
78
+ instantiate(offset_start, offset_end, data_start, data_end)
60
79
  end
61
80
  end
62
-
81
+
63
82
  def instantiate(offset_start, offset_end, data_start = nil, data_end = nil)
64
83
  data_start = data_to_ary(data_start)
65
84
  data_end = data_to_ary(data_end)
@@ -85,7 +104,7 @@ class TagAlong
85
104
  a_hash[(key.to_sym rescue key) || key] = a_hash.delete(key)
86
105
  end
87
106
  end
88
-
107
+
89
108
  end
90
109
 
91
110
  end
@@ -0,0 +1,132 @@
1
+ require 'polyglot'
2
+ require 'treetop'
3
+
4
+ class TagAlong
5
+
6
+ class TaggedText
7
+ CHR = { '<' => 60, '>' => 62 }
8
+ SPACES = { 9 => true, 10 => true, 11 => true, 12 => true,
9
+ 13 => true, 32 => true, 133 => true, 160 => true,
10
+ 5760 => true, 6158 => true, 8192 => true,
11
+ 8193 => true, 8194 => true, 8195 => true,
12
+ 8196 => true, 8197 => true, 8198 => true,
13
+ 8199 => true, 8200 => true, 8201 => true,
14
+ 8202 => true, 8232 => true, 8233 => true,
15
+ 8239 => true, 8287 => true, 12288 => true }
16
+
17
+ attr_reader :tagged_text, :offsets
18
+
19
+ def initialize(tagged_text, opts = {})
20
+ @normalize_spaces = true if opts[:normalize_spaces]
21
+ @tagged_text = tagged_text
22
+ @inside_tag = false
23
+ @inside_space = false
24
+ @offsets = []
25
+ @text = []
26
+ @text_offset = 0
27
+ @current_offset = { type: :text, start: 0, end: nil,
28
+ text_start: 0, text_end: nil }
29
+ process_tagged_text
30
+ end
31
+
32
+ def plain_text
33
+ @text.pack('U*')
34
+ end
35
+
36
+ def adjust_offsets(plain_text_offsets)
37
+ plain_text_offsets = plain_text_offsets.is_a?(Offsets) ?
38
+ plain_text_offsets :
39
+ Offsets.new(plain_text_offsets)
40
+ adjusted_offsets = TagAlong::Offsets.new([])
41
+ @offsets.each do |offset|
42
+ next if offset[:type] == :tag
43
+ process_offset(plain_text_offsets, offset, adjusted_offsets)
44
+ break if plain_text_offsets.empty?
45
+ end
46
+ adjusted_offsets
47
+ end
48
+
49
+
50
+ private
51
+
52
+ def process_offset(plain_text_offsets, offset, adjusted_offsets)
53
+ o = plain_text_offsets[0]
54
+
55
+ return if o.offset_start > offset[:text_start]
56
+ unless o.adj_start
57
+ delta = o.offset_start - offset[:text_start]
58
+ o.adj_start = offset[:start] + delta
59
+ end
60
+
61
+ if o.offset_end <= offset[:text_end]
62
+ delta = o.offset_end - offset[:text_end]
63
+ o = plain_text_offsets.shift
64
+ o.offset_start = o.delete_field(:adj_start)
65
+ o.offset_end = offset[:end] + delta
66
+ adjusted_offsets << o
67
+ end
68
+ end
69
+
70
+ def process_tagged_text
71
+ opts = { count: 0, chr: nil }
72
+ while opts[:chr] = tagged_text_ary.shift
73
+ @inside_tag ? process_inside_tag(opts) : process_outside_tag(opts)
74
+ opts[:count] += 1
75
+ end
76
+ end
77
+
78
+ def tagged_text_ary
79
+ @tagged_text_ary ||= @tagged_text.unpack('U*')
80
+ end
81
+
82
+ def process_outside_tag(opts)
83
+ if opts[:chr] == CHR['<']
84
+ @inside_tag = true
85
+ if opts[:count] > 0
86
+ @current_offset[:end] = opts[:count] - 1
87
+ @current_offset[:text_end] = @text_offset - 1
88
+ @offsets << @current_offset
89
+ end
90
+ @current_offset = { type: :tag, start: opts[:count], end: nil }
91
+ else
92
+ process_text(opts)
93
+ end
94
+ end
95
+
96
+ def process_inside_tag(opts)
97
+ if opts[:chr] == CHR['>']
98
+ @inside_tag = false
99
+ @current_offset[:end] = opts[:count]
100
+ @offsets << @current_offset
101
+ @current_offset = { type: :text, start: opts[:count] + 1, end: nil,
102
+ text_start: @text_offset, text_end: nil }
103
+ end
104
+ end
105
+
106
+ def process_text(opts)
107
+ if @normalize_spaces
108
+ process_normalized_spaces_text(opts)
109
+ else
110
+ add_to_text(opts)
111
+ end
112
+ end
113
+
114
+ def add_to_text(opts)
115
+ @text_offset += 1
116
+ @text << opts[:chr]
117
+ end
118
+
119
+ def process_normalized_spaces_text(opts)
120
+ @inside_space ? process_inside_space(opts) : process_outside_space(opts)
121
+ end
122
+
123
+ def process_inside_space(opts)
124
+ #TODO
125
+ end
126
+
127
+ def process_outside_space(opts)
128
+ #TODO
129
+ end
130
+ end
131
+ end
132
+
@@ -1,3 +1,3 @@
1
1
  class TagAlong
2
- VERSION = '0.7.3'
2
+ VERSION = '0.8.0'
3
3
  end
data/lib/tag_along.rb CHANGED
@@ -1,10 +1,10 @@
1
1
  require 'ostruct'
2
2
  require 'tag_along/version'
3
3
  require 'tag_along/offsets'
4
+ require 'tag_along/tagged_text'
4
5
 
5
6
  class TagAlong
6
-
7
- attr :text, :tagged_text
7
+ attr :text, :tagged_text
8
8
 
9
9
  def self.version
10
10
  VERSION
data/spec/spec_helper.rb CHANGED
@@ -29,5 +29,18 @@ unless defined?(SPEC_VARS)
29
29
  FILES_DIR = File.expand_path(File.join(File.dirname(__FILE__), 'files'))
30
30
  TEXT, OFFSETS_ARY, OFFSETS_HASH, OFFSETS_OBJ =
31
31
  TagAlongSpec.process_spec_data(FILES_DIR)
32
+ HTML_TEXT = "
33
+ <html>
34
+ <head>
35
+ <title>Days of the week</title>
36
+ </head>
37
+ <body>
38
+ <p>
39
+ There's <strong>Sunday</strong>
40
+ and there's <strong>Monday</strong>
41
+ </p>
42
+ </body>
43
+ </html>
44
+ "
32
45
  SPEC_VARS = true
33
46
  end
@@ -0,0 +1,46 @@
1
+ require_relative '../spec_helper'
2
+
3
+ describe TagAlong::TaggedText do
4
+
5
+ describe 'No space normalization' do
6
+
7
+ let(:text) do
8
+ "\n \n \n Days of the week" +
9
+ "\n \n \n \n There's" +
10
+ " Sunday\n and there's Monday" +
11
+ "\n \n \n \n "
12
+ end
13
+
14
+ let(:text_offsets) { [[77,82],[104,109]] }
15
+
16
+ subject { TagAlong::TaggedText.new(HTML_TEXT) }
17
+
18
+ its(:tagged_text) { should == HTML_TEXT }
19
+ its(:plain_text) { should == text }
20
+ it 'should get offsets' do
21
+ subject.offsets[1].should == { type: :tag , start: 5, end: 10 }
22
+ subject.offsets[2].should == { type: :text , start: 11,
23
+ end: 17, text_start: 5, text_end: 11 }
24
+ end
25
+
26
+ it 'should adjust offsets' do
27
+ text_offsets.should == [[77,82],[104,109]]
28
+ offsets = subject.adjust_offsets(text_offsets)
29
+ offsets.should be_kind_of TagAlong::Offsets
30
+ offsets.map { |o| [o.offset_start, o.offset_end] }.should ==
31
+ [[128, 133], [172, 177]]
32
+ end
33
+
34
+ end
35
+
36
+ describe 'space normalization' do
37
+ let(:text) { "Days of the week There's Sunday and there's Monday" }
38
+
39
+ subject { TagAlong::TaggedText.new(HTML_TEXT, normalize_spaces: true) }
40
+
41
+ its(:tagged_text) { should == HTML_TEXT }
42
+ its(:plain_text) { should == text }
43
+ end
44
+
45
+ end
46
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tag_along
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.3
4
+ version: 0.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dmitry Mozzherin
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-09-12 00:00:00.000000000 Z
11
+ date: 2013-10-08 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: |-
14
14
  Tags a text with arbitrary tags
@@ -29,10 +29,12 @@ files:
29
29
  - Rakefile
30
30
  - lib/tag_along.rb
31
31
  - lib/tag_along/offsets.rb
32
+ - lib/tag_along/tagged_text.rb
32
33
  - lib/tag_along/version.rb
33
34
  - spec/files/spec_data.json
34
35
  - spec/spec_helper.rb
35
36
  - spec/tag_along/offsets_spec.rb
37
+ - spec/tag_along/tagged_text_spec.rb
36
38
  - spec/tag_along_spec.rb
37
39
  - tag_along.gemspec
38
40
  homepage: https://github.com/GlobalNamesArchitecture/tag_along
@@ -65,4 +67,5 @@ test_files:
65
67
  - spec/files/spec_data.json
66
68
  - spec/spec_helper.rb
67
69
  - spec/tag_along/offsets_spec.rb
70
+ - spec/tag_along/tagged_text_spec.rb
68
71
  - spec/tag_along_spec.rb