pertinent_parser 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/pertinent_parser.rb +78 -0
- data/lib/pertinent_parser/rule.rb +79 -0
- data/lib/pertinent_parser/text.rb +27 -0
- data/lib/pertinent_parser/transform.rb +25 -0
- metadata +61 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 82c1ea5ccb406cf088548e021a771378c318e3d1
|
4
|
+
data.tar.gz: 9b89619ed72dda0008192856ea343f624b0573b5
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e3d3d6f55364d41ab21ebce1e48a9d2e2b84aaf737afb7b8390a599bbd206dff1ea72874772f9e5dadb340efd794b0b3c62877540915529ff142a406e839ae37
|
7
|
+
data.tar.gz: 8238784a074f077775b6d8df61c792a77b5ff0f2e8f8b23c8418f3d4eecd5f7ad15edd4059374744240b8dac1445ac85090a5503e79a603fa1f0da5f9cd54073
|
@@ -0,0 +1,78 @@
|
|
1
|
+
require "hpricot"
|
2
|
+
require "pertinent_parser/transform"
|
3
|
+
require "pertinent_parser/rule"
|
4
|
+
require "pertinent_parser/text"
|
5
|
+
|
6
|
+
# Reopens Hpricot's element class to expose the opening ("start") tag as a string.
class Hpricot::Elem
  # Builds the opening tag for this element from its name and attributes.
  #
  # A self-closing " /" is inserted before the ">" only when the element
  # reports itself as empty and has no end tag (+etag+ is falsy).
  #
  # Returns the tag text as a String, e.g. "<name attrs>" or "<name attrs />".
  def stag
    "<#{name}#{attributes_as_html}" +
      ((empty? && !etag) ? " /" : "") +
      ">"
  end
end
|
13
|
+
|
14
|
+
# Entry points for building Text objects and the Rule/Transform pairs that are
# attached to them. All methods are module-level (defined on the singleton).
module PertinentParser
  class << self
    # Parses +html+ with Hpricot and returns a Text holding the document's
    # inner text, with one wrap rule added for every non-text node visited.
    #
    # The traversal is done by hand (rather than relying on Hpricot's own
    # serialisation) so the HTML representation can be rebuilt from rules.
    #
    # html - String of HTML markup.
    #
    # Returns the Text built from the document's inner text.
    def html(html)
      doc = Hpricot(html)
      cursor = 0 # number of text characters consumed so far
      t = text(doc.inner_text)
      doc.traverse_all_element do |elem|
        if elem.text?
          cursor += elem.inner_text.size
        else
          # The element covers the next inner_text.size characters of text.
          # NOTE(review): an element with no inner text yields an empty range;
          # confirm how Rule is expected to treat that.
          t + wrap_(cursor...cursor + elem.inner_text.size, elem.stag)
        end
      end
      t
    end

    # Wraps the string +s+ in a Text whose root rule spans every character
    # and carries an :identity transform.
    #
    # Returns the new Text.
    def text(s)
      t = Text.new(s)
      t.rule = Rule.new((0..s.size - 1), Transform.new(:identity, ["id"]))
      t
    end

    # Converts a [start, end_exclusive] pair (as returned by MatchData#offset)
    # into an inclusive Range start..end_exclusive-1.
    def offset_to_r(o)
      (o[0]..o[1] - 1)
    end

    # Finds the +number+-th occurrence (1-based) of the literal string
    # +target+ inside +context+. Occurrences are counted by start position,
    # so overlapping occurrences each count.
    #
    # Returns the inclusive Range of character indices of that occurrence,
    # or nil when there are fewer than +number+ occurrences.
    def range_from_specification(context, target, number)
      pattern = Regexp.new(Regexp.escape(target))
      count = 0
      position = 0
      while (match = context.match(pattern, position))
        offsets = match.offset(0)
        count += 1
        return offset_to_r(offsets) if count == number

        # Resume just past this match's start so the same occurrence is not
        # found again, while still allowing an overlapping one.
        position = offsets.first + 1
      end
      nil
    end

    # Builds a wrap rule around the +number+-th occurrence of +target+ in
    # +context+, using +tag+ as the opening tag.
    #
    # Returns the Rule. When the occurrence does not exist the rule is built
    # from a nil range (see range_from_specification).
    def new_wrap(context, target, number, tag)
      range = range_from_specification(context, target, number)
      wrap_(range, tag)
    end

    # Convenience constructor: a Rule over +range+ with +transform+.
    def rule(range, transform)
      Rule.new(range, transform)
    end

    # Builds a Rule over +range+ whose :wrap transform surrounds the text
    # with +tag+ and a closing tag derived from it.
    #
    # The tag name is whatever follows the first "<" up to the first
    # whitespace or ">" (so '<a href="x">' closes with "</a>"). The character
    # class excludes ">" so the name can never run past the end of the tag.
    #
    # Raises NoMethodError when +tag+ contains no "<name" followed by
    # whitespace or ">" (the match is nil).
    def wrap_(range, tag)
      tag_name = tag.match(/<([^\s>]*)[\s>]/)[1]
      Rule.new(range, Transform.new(:wrap, [tag, "</" + tag_name + ">"]))
    end

    # Builds a Rule that replaces the +number+-th occurrence of +target+ in
    # +context+ with +replacement+.
    #
    # Returns the Rule (built from a nil range when the occurrence is absent).
    def new_replace(context, target, number, replacement)
      range = range_from_specification(context, target, number)
      Rule.new(range, Transform.new(:replacement, replacement))
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# A node in a tree of range-scoped text transformations.
#
# A rule covers a set of character indices (+range+, stored as an Array of
# Integers) and owns a +transform+ object that must respond to
# +apply(string)+ and +split(n)+. +children+ holds the rules nested inside
# this one, kept sorted by the first index of their range; +parent+ is the
# rule this one is nested in, or nil while the rule is detached / the root.
#
# NOTE: a rule with an empty range is not supported by apply_recur, because
# +range.first+ is nil for an empty Array.
class Rule
  attr_accessor :name, :children, :parent
  attr_accessor :transform
  attr_accessor :range

  # range     - anything responding to #to_a (a Range of indices, an Array,
  #             or nil, which becomes an empty Array).
  # transform - object responding to apply/split (may be nil).
  # children  - Array of nested rules.
  # parent    - enclosing Rule or nil.
  def initialize(range, transform = nil, children = [], parent = nil)
    @range = range.to_a
    @children = children
    @parent = parent
    @transform = transform
  end

  # Orders rules by the first index they cover.
  def <=>(other)
    range.first <=> other.range.first
  end

  # Applies the children and then this rule's transform to +s+ IN PLACE.
  #
  # s      - the String being rewritten (mutated).
  # offset - how many characters the text before this rule has grown by.
  #
  # Returns the number of characters this rule (children included) added to
  # +s+; negative when it shrank.
  def apply_recur(s, offset = 0)
    pre = offset
    # Each child shifts everything after it, so the running offset is
    # threaded through the children in order.
    @children.each { |child| offset += child.apply_recur(s, offset) }

    # Start is shifted by growth before this rule, end by growth that also
    # includes this rule's children.
    span = (@range.first + pre)..(@range.last + offset)
    transformed = @transform.apply(s[span])
    s[span] = transformed
    transformed.size - range.size
  end

  # Returns a transformed copy of +str+; +str+ itself is left untouched.
  def apply(str)
    s = str.dup
    apply_recur(s)
    s
  end

  # Adds +text+'s root rule to this rule and returns +text+.
  def +(text)
    add(text.rule)
    text
  end

  # Merges +new_rule+ into the tree rooted at this rule.
  #
  # Returns:
  #   :inside  - new_rule lies within this rule and has been attached
  #              somewhere below it;
  #   :outside - the ranges do not intersect, nothing was changed;
  #   :contain - this rule lies strictly within new_rule and already has a
  #              parent (the caller is expected to re-home it);
  #   a Rule   - either new_rule itself, after this detached rule was made
  #              its child, or the part of new_rule left over after a
  #              partial overlap was split off.
  def add(new_rule)
    intersection = range & new_rule.range
    if intersection == new_rule.range
      insert_within(new_rule)
    elsif intersection.empty?
      :outside
    elsif intersection == range
      return :contain unless @parent.nil?

      become_child_of(new_rule)
    else
      split_across_boundary(new_rule, intersection)
    end
  end

  private

  # Attaches +new_rule+ (which lies within this rule) below this rule.
  def insert_within(new_rule)
    contained = []
    input = new_rule
    @children.each do |child|
      case (result = child.add(input))
      when Rule then input = result # keep placing whatever is left over
      when :inside then return :inside
      when :contain then contained << child
      end
    end

    # Children swallowed by the incoming rule move underneath it.
    @children -= contained
    contained.each { |child| input.add(child) }

    # Record the link: without it a nested rule still looks detached and a
    # later containing rule would adopt it while it stays listed here too.
    input.parent = self
    @children << input
    @children.sort!
    :inside
  end

  # Makes this detached rule the sole child of +new_rule+, then re-adds the
  # children new_rule had before so they are placed relative to this rule.
  def become_child_of(new_rule)
    displaced = new_rule.children
    new_rule.children = [self]
    @parent = new_rule
    displaced.each { |child| new_rule.add(child) }
    new_rule
  end

  # Splits a partially overlapping +new_rule+ at this rule's boundary: the
  # overlapping part is added inside this rule, the rest is returned.
  def split_across_boundary(new_rule, intersection)
    difference = new_rule.range - intersection
    transforms = new_rule.transform.split(difference.size)
    # split returns the pieces in text order; pair them with the two halves.
    if intersection.first < difference.first
      inter_tran, diff_tran = transforms
    else
      diff_tran, inter_tran = transforms
    end
    add(Rule.new(intersection, inter_tran))
    Rule.new(difference, diff_tran)
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# A String that carries a root +rule+ describing transformations to apply
# to its characters.
class Text < String
  attr_accessor :rule

  # Returns the result of applying the attached rule to this text
  # (the value returned by +rule.apply(self)+).
  def apply
    @rule.apply(self)
  end

  # Overrides String#+ for Text: instead of concatenating, the operand is
  # treated as a rule and added to this text's root rule.
  #
  # Returns whatever +rule.add+ returns.
  def +(new_rule)
    @rule.add(new_rule)
  end

  # Wrap text, falling inside of existing boundaries.
  #
  # tag    - opening tag String, e.g. "<b>".
  # target - literal substring to wrap.
  # number - which occurrence of +target+ to wrap (1-based, default 1).
  def wrap_in(tag, target, number = 1)
    self + PertinentParser.new_wrap(self, target, number, tag)
  end

  # Adds a rule replacing the +number+-th occurrence of +target+ with
  # +replacement+.
  #
  # NOTE: this shadows String#replace, which has a different signature.
  def replace(replacement, target, number = 1)
    self + PertinentParser.new_replace(self, target, number, replacement)
  end

  # Wrap text, falling outside of existing boundaries: the new wrap rule is
  # the receiver and this text is added to it.
  #
  # Returns the result of +new_rule + self+.
  def wrap_out(tag, target, number = 1)
    PertinentParser.new_wrap(self, target, number, tag) + self
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Describes how a span of text is rewritten.
#
# type     - one of :identity, :replacement or :wrap.
# property - type-specific payload:
#            :replacement -> the replacement String;
#            :wrap        -> [opening_tag, closing_tag];
#            :identity    -> not read by this class.
class Transform
  attr_accessor :type, :property

  def initialize(type, property)
    @type = type
    @property = property
  end

  # Splits this transform into two, for a span being cut in two pieces.
  #
  # n - for :replacement, the number of leading characters of the
  #     replacement that go to the first piece; ignored otherwise.
  #
  # Returns a two-element Array of Transforms:
  #   :replacement - the first +n+ characters and the remainder (either may
  #                  be the empty String);
  #   :wrap        - this transform and a copy, so both pieces get wrapped;
  #   :identity    - this transform and a copy.
  # Returns nil for any other type.
  def split(n)
    case @type
    when :replacement
      # An exclusive range keeps n == 0 from selecting the whole string
      # (0..-1); to_s covers n past the end, where slicing yields nil.
      [Transform.new(:replacement, @property[0...n]),
       Transform.new(:replacement, @property[n..-1].to_s)]
    when :wrap, :identity
      [self, dup]
    end
  end

  # Applies the transform to the String +s+.
  #
  # Returns +s+ unchanged for :identity, the replacement for :replacement,
  # and +s+ surrounded by the two tags for :wrap. Returns nil for any other
  # type.
  def apply(s)
    case @type
    when :identity then s
    when :replacement then @property
    when :wrap then @property[0] + s + @property[1]
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: pertinent_parser
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Matthew Bunday
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-04-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: hpricot
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.8.6
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.8.6
|
27
|
+
description: PertinentParser helps you compose HTML tags across existing tag boundaries.
|
28
|
+
email: mkbunday@gmail.com
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- lib/pertinent_parser.rb
|
34
|
+
- lib/pertinent_parser/rule.rb
|
35
|
+
- lib/pertinent_parser/text.rb
|
36
|
+
- lib/pertinent_parser/transform.rb
|
37
|
+
homepage: https://github.com/zencephalon/Pertinent_Parser
|
38
|
+
licenses:
|
39
|
+
- MIT
|
40
|
+
metadata: {}
|
41
|
+
post_install_message:
|
42
|
+
rdoc_options: []
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: '0'
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
requirements: []
|
56
|
+
rubyforge_project:
|
57
|
+
rubygems_version: 2.2.2
|
58
|
+
signing_key:
|
59
|
+
specification_version: 4
|
60
|
+
summary: PertinentParser helps you compose HTML tags across existing tag boundaries.
|
61
|
+
test_files: []
|