pertinent_parser 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/pertinent_parser.rb +78 -0
- data/lib/pertinent_parser/rule.rb +79 -0
- data/lib/pertinent_parser/text.rb +27 -0
- data/lib/pertinent_parser/transform.rb +25 -0
- metadata +61 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 82c1ea5ccb406cf088548e021a771378c318e3d1
|
4
|
+
data.tar.gz: 9b89619ed72dda0008192856ea343f624b0573b5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e3d3d6f55364d41ab21ebce1e48a9d2e2b84aaf737afb7b8390a599bbd206dff1ea72874772f9e5dadb340efd794b0b3c62877540915529ff142a406e839ae37
|
7
|
+
data.tar.gz: 8238784a074f077775b6d8df61c792a77b5ff0f2e8f8b23c8418f3d4eecd5f7ad15edd4059374744240b8dac1445ac85090a5503e79a603fa1f0da5f9cd54073
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require "hpricot"
|
2
|
+
require "pertinent_parser/transform"
|
3
|
+
require "pertinent_parser/rule"
|
4
|
+
require "pertinent_parser/text"
|
5
|
+
|
6
|
+
class Hpricot::Elem
  # Builds the start tag of this element as a string: "<", the element name,
  # whatever +attributes_as_html+ returns, then " /" when the element is
  # empty and has no end tag, and finally ">".
  def stag
    head = "<#{name}#{attributes_as_html}"
    slash = if empty? && !etag then " /" else "" end
    head + slash + ">"
  end
end
|
13
|
+
|
14
|
+
module PertinentParser
  class << self
    # Builds a Text from an HTML string.
    #
    # The document is parsed with Hpricot and walked by hand so that the tag
    # structure can be re-expressed as wrap rules over the plain inner text.
    # Text nodes advance a running character offset; every other node adds a
    # wrap rule covering the span of its inner text, using the node's start
    # tag (see Hpricot::Elem#stag).
    #
    # @param html [String] HTML markup
    # @return [Text] the document's inner text carrying one rule per element
    def html(html)
      doc = Hpricot(html)
      cursor = 0
      t = text(doc.inner_text)
      doc.traverse_all_element do |elem|
        if elem.text?
          cursor += elem.inner_text.size
        else
          # Text#+ registers the rule on t; the return value is not needed.
          t + wrap_(cursor...cursor + elem.inner_text.size, elem.stag)
        end
      end
      t
    end

    # Wraps a plain string in a Text whose root rule is an identity
    # transform spanning the whole string.
    #
    # @param s [String]
    # @return [Text]
    def text(s)
      t = Text.new(s)
      t.rule = Rule.new((0..s.size - 1), Transform.new(:identity, ["id"]))
      t
    end

    # Converts a [start, end) offset pair (as returned by MatchData#offset)
    # into an inclusive Range of character indices.
    #
    # @param o [Array<Integer>] two-element [start, exclusive_end] pair
    # @return [Range]
    def offset_to_r(o)
      (o[0]..o[1] - 1)
    end

    # Locates the +number+-th occurrence (1-based) of the literal string
    # +target+ inside +context+. Occurrences are enumerated by start
    # position, so overlapping occurrences are counted separately.
    #
    # @param context [String] text to search
    # @param target [String] literal text to look for (regexp-escaped)
    # @param number [Integer] 1-based occurrence index
    # @return [Range, nil] inclusive character range of the occurrence, or
    #   nil when there are fewer than +number+ occurrences
    def range_from_specification(context, target, number)
      pattern = Regexp.new(Regexp.escape(target))
      count = 0
      position = 0
      while (match = context.match(pattern, position))
        offsets = match.offset(0)
        count += 1
        return offset_to_r(offsets) if count == number
        # Resume one character past the start of this match: the next search
        # then finds the next distinct occurrence directly instead of
        # rediscovering the same one from every intermediate position.
        position = offsets.first + 1
      end
      nil
    end

    # Builds a wrap rule around the +number+-th occurrence of +target+.
    #
    # @return [Rule]
    def new_wrap(context, target, number, tag)
      wrap_(range_from_specification(context, target, number), tag)
    end

    # Convenience constructor for a Rule.
    #
    # @return [Rule]
    def rule(range, transform)
      Rule.new(range, transform)
    end

    # Builds a rule that wraps +range+ in +tag+ and a matching end tag.
    # The end tag name is whatever follows "<" in +tag+ up to the first
    # whitespace or ">".
    #
    # @param range [Range, Array<Integer>] character indices to wrap
    # @param tag [String] start tag, e.g. "<b>" or "<a href='x'>"
    # @return [Rule]
    def wrap_(range, tag)
      tag_name = tag.match(/<(\S*)(\s|>)/)[1]
      Rule.new(range, Transform.new(:wrap, [tag, "</" + tag_name + ">"]))
    end

    # Builds a rule that replaces the +number+-th occurrence of +target+
    # with +replacement+.
    #
    # @return [Rule]
    def new_replace(context, target, number, replacement)
      range = range_from_specification(context, target, number)
      Rule.new(range, Transform.new(:replacement, replacement))
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# A node in a tree of text transformations. Each rule covers a set of
# character indices of the original string and carries a transform; child
# rules cover sub-spans and are applied before their parent.
class Rule
  attr_accessor :name, :children, :parent
  attr_accessor :transform
  attr_accessor :range

  # @param range [Range, Array<Integer>, nil] character indices covered;
  #   stored as an Array (nil becomes an empty Array)
  # @param transform [#apply, #split, nil] transformation for the span
  # @param children [Array<Rule>] rules nested inside this one
  # @param parent [Rule, nil] enclosing rule, nil for a root or detached rule
  def initialize(range, transform=nil, children=[], parent=nil)
    @range = range.to_a
    @children = children
    @parent = parent
    @transform = transform
    # Keep the back-reference consistent for children handed in up front.
    @children.each { |child| child.parent = self }
  end

  # Orders rules by the first index they cover.
  def <=>(r)
    range.first <=> r.range.first
  end

  # Applies this rule and its descendants to +s+ in place.
  #
  # @param s [String] string being rewritten (mutated)
  # @param offset [Integer] how far text before this rule has grown so far
  # @return [Integer] net change in length caused by this subtree
  def apply_recur(s, offset=0)
    start = @range.first + offset
    # Each child reports how much it grew the string; later children and
    # the end of this rule's own span shift by that amount.
    @children.each { |child| offset += child.apply_recur(s, offset) }
    finish = @range.last + offset
    transformed = @transform.apply(s[start..finish])
    s[start..finish] = transformed
    transformed.size - range.size
  end

  # Returns a transformed copy of +str+; +str+ itself is not modified.
  def apply(str)
    s = str.dup
    apply_recur(s)
    s
  end

  # Adds +text+'s rule to this rule and returns +text+.
  def +(text)
    add(text.rule)
    text
  end

  # Merges +new_rule+ into the tree rooted at this rule.
  #
  # @param new_rule [Rule]
  # @return [Symbol, Rule]
  #   :inside  - new_rule lies within this rule and was placed beneath it
  #   :outside - the two rules do not overlap; nothing was changed
  #   :contain - new_rule encloses this (already parented) rule; the caller
  #              is expected to re-home this rule under new_rule
  #   Rule     - either new_rule itself after it adopted this parentless
  #              rule, or the leftover part of new_rule that sticks out of
  #              this rule after a partial overlap
  def add(new_rule)
    intersection = range & new_rule.range
    if intersection == new_rule.range
      contain = []
      input = new_rule
      @children.each do |child|
        result = child.add(input)
        case result
        when Rule
          input = result
        when :inside
          return :inside
        when :contain
          contain << child
        when :outside
        end
      end
      # Children enclosed by the incoming rule move beneath it.
      @children -= contain
      contain.each { |child| input.add(child) }
      adopt(input)
      @children.sort!
      :inside
    elsif intersection.empty?
      :outside
    elsif intersection == range
      # A rule that already has a parent lets that parent do the re-homing.
      return :contain unless @parent.nil?
      orphans = new_rule.children
      new_rule.children = [self]
      # Record the new parent before re-adding the orphans, so that an
      # orphan enclosing this rule takes the :contain path instead of
      # attaching this rule in two places.
      @parent = new_rule
      orphans.each { |child| new_rule.add(child) }
      new_rule
    else
      # Partial overlap: the overlapping part goes inside this rule, the
      # remainder is handed back to the caller to place elsewhere.
      difference = new_rule.range - intersection
      transforms = new_rule.transform.split(difference.size)
      if intersection.first < difference.first
        inter_tran, diff_tran = transforms
      else
        diff_tran, inter_tran = transforms
      end
      add(Rule.new(intersection, inter_tran))
      Rule.new(difference, diff_tran)
    end
  end

  private

  # Attaches +child+ beneath this rule and records this rule as its parent.
  def adopt(child)
    child.parent = self
    @children << child
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# A String that carries a tree of rules describing how to rewrite it.
class Text < String
  attr_accessor :rule

  # Returns the result of applying the attached rule tree to this text
  # (for example the HTML produced once all wrap rules are applied).
  def apply
    rule.apply(self)
  end

  # String concatenation is removed; "+" instead adds a rule to the
  # attached rule tree and returns whatever Rule#add returns.
  undef :+
  def +(new_rule)
    rule.add(new_rule)
  end

  # Wraps the +number+-th occurrence of +target+ in +tag+, placing the new
  # wrapper inside existing boundaries.
  def wrap_in(tag, target, number=1)
    wrapper = PertinentParser.new_wrap(self, target, number, tag)
    self + wrapper
  end

  # Replaces the +number+-th occurrence of +target+ with +replacement+.
  # Note that this takes the place of String#replace on Text instances.
  def replace(replacement, target, number=1)
    substitution = PertinentParser.new_replace(self, target, number, replacement)
    self + substitution
  end

  # Wraps the +number+-th occurrence of +target+ in +tag+, placing the new
  # wrapper outside existing boundaries. Returns this text (via Rule#+).
  def wrap_out(tag, target, number=1)
    wrapper = PertinentParser.new_wrap(self, target, number, tag)
    wrapper + self
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Describes how a span of text is rewritten. +type+ selects the behaviour
# and +property+ holds its data:
#   :identity    - text is returned unchanged (property is not used)
#   :replacement - text is replaced by +property+ (a String)
#   :wrap        - text is surrounded by property[0] and property[1]
class Transform
  attr_accessor :type, :property

  # @param type [Symbol] :identity, :replacement or :wrap
  # @param property [Object] data for the transform, see class comment
  def initialize(type, property)
    @type, @property = type, property
  end

  # Splits this transform into two transforms for two adjacent spans.
  #
  # A replacement is cut after its first +n+ characters. Start/length
  # slicing is used so that n == 0 yields an empty first half (a
  # "0..n-1" range would become "0..-1", i.e. the whole string), and a cut
  # past the end yields an empty second half instead of nil. Wrap and
  # identity transforms apply unchanged to both halves.
  #
  # @param n [Integer] number of replacement characters in the first half
  # @return [Array<Transform>, nil] two transforms, or nil for an
  #   unrecognised type
  def split(n)
    case @type
    when :replacement
      [
        Transform.new(:replacement, @property[0, n] || ""),
        Transform.new(:replacement, @property[n..-1] || "")
      ]
    when :wrap, :identity
      [self, dup]
    end
  end

  # Applies the transform to +s+.
  #
  # @param s [String] the text covered by the owning rule
  # @return [String, nil] transformed text, or nil for an unrecognised type
  def apply(s)
    case @type
    when :identity
      s
    when :replacement
      @property
    when :wrap
      @property[0] + s + @property[1]
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pertinent_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Matthew Bunday
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-04-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: hpricot
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.8.6
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.8.6
|
27
|
+
description: PertinentParser helps you compose HTML tags across existing tag boundaries.
|
28
|
+
email: mkbunday@gmail.com
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- lib/pertinent_parser.rb
|
34
|
+
- lib/pertinent_parser/rule.rb
|
35
|
+
- lib/pertinent_parser/text.rb
|
36
|
+
- lib/pertinent_parser/transform.rb
|
37
|
+
homepage: https://github.com/zencephalon/Pertinent_Parser
|
38
|
+
licenses:
|
39
|
+
- MIT
|
40
|
+
metadata: {}
|
41
|
+
post_install_message:
|
42
|
+
rdoc_options: []
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '0'
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
requirements: []
|
56
|
+
rubyforge_project:
|
57
|
+
rubygems_version: 2.2.2
|
58
|
+
signing_key:
|
59
|
+
specification_version: 4
|
60
|
+
summary: PertinentParser helps you compose HTML tags across existing tag boundaries.
|
61
|
+
test_files: []
|