escapement 1.0.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +14 -2
- data/lib/escapement.rb +2 -1
- data/lib/escapement/element.rb +30 -0
- data/lib/escapement/elements/base.rb +30 -0
- data/lib/escapement/elements/list.rb +20 -0
- data/lib/escapement/elements/ordered_list.rb +11 -0
- data/lib/escapement/elements/paragraph.rb +22 -0
- data/lib/escapement/elements/unordered_list.rb +11 -0
- data/lib/escapement/html.rb +6 -6
- data/lib/escapement/pretty_names.rb +24 -0
- data/lib/escapement/tag.rb +1 -19
- data/lib/escapement/traversal.rb +1 -1
- data/lib/escapement/version.rb +1 -1
- metadata +9 -3
- data/lib/escapement/block.rb +0 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a022be5b1869ab5770ced1ed81a0ffa124739244
|
4
|
+
data.tar.gz: e808af33e9c5a56413dfc3973ccca4ffdce5da60
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 909dd0b5780eea310435ada122f4b93529befa2f2af7be25d5e590cc840349704cc0cedd6e9cf8f9cea8f20f3ac8ddbeb78477ffc77533426e9670c199f4e955
|
7
|
+
data.tar.gz: 35e0febb031007db8d9da3bd114ec530595e6d8eeb9700feebd15699ffaa37ebf35a7025cc9b77c604799df5d49edfe4787cd24f77b9a700518099ccd55dc7f8
|
data/README.md
CHANGED
@@ -20,7 +20,7 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
Basic usage is very straightforward. Escapement will consider all root-level tags as separate
|
23
|
+
Basic usage is very straightforward. Escapement will consider all root-level tags as separate elements.
|
24
24
|
|
25
25
|
The position values are 0-based and are relative to the plain text result. The first value is the start of the attributed text, and the second is the end of the attributed text.
|
26
26
|
|
@@ -30,7 +30,19 @@ body = "<p>Isn't <i>Tourbillon</i> a <a href=\"http://google.com\">great</a> wor
|
|
30
30
|
html = Escapement::HTML.new(body)
|
31
31
|
html.extract!
|
32
32
|
html.results
|
33
|
-
# => [{:text=>"Isn't Tourbillon a great word?", :entities=>[{:type=>"italic", :html_tag=>"i", :position=>[6, 16], :attributes=>{}}, {:type=>"link", :html_tag=>"a", :position=>[19, 24], :attributes=>{"href"=>"http://google.com"}}]}]
|
33
|
+
# => [{:type=>"paragraph", :html_tag=>"p", :text=>"Isn't Tourbillon a great word?", :entities=>[{:type=>"italic", :html_tag=>"i", :position=>[6, 16], :attributes=>{}}, {:type=>"link", :html_tag=>"a", :position=>[19, 24], :attributes=>{"href"=>"http://google.com"}}]}]
|
34
|
+
```
|
35
|
+
|
36
|
+
Escapement also supports lists (with nesting), treating each list item as a separate paragraph-like element.
|
37
|
+
|
38
|
+
``` ruby
|
39
|
+
body = "<ul><li>List item 1</li><ul><li><b>Nested</b> list item</li></ul><li>List item 2</li></ul>"
|
40
|
+
|
41
|
+
html = Escapement::HTML.new(body)
|
42
|
+
html.extract!
|
43
|
+
html.results
|
44
|
+
|
45
|
+
# => [{:type=>"unordered_list", :html_tag=>"ul", :children=>[{:type=>"list_item", :html_tag=>"li", :text=>"List item 1", :entities=>[]}, {:type=>"unordered_list", :html_tag=>"ul", :children=>[{:type=>"list_item", :html_tag=>"li", :text=>"Nested list item", :entities=>[{:type=>"bold", :html_tag=>"b", :position=>[0, 6], :attributes=>{}}]}]}, {:type=>"list_item", :html_tag=>"li", :text=>"List item 2", :entities=>[]}]}]
|
34
46
|
```
|
35
47
|
|
36
48
|
## How It Works
|
data/lib/escapement.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
require "nokogiri"
|
2
2
|
require "escapement/traversal"
|
3
|
+
require "escapement/pretty_names"
|
3
4
|
require "escapement/attributes"
|
4
|
-
require "escapement/
|
5
|
+
require "escapement/element"
|
5
6
|
require "escapement/html"
|
6
7
|
require "escapement/tag"
|
7
8
|
require "escapement/version"
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'escapement/elements/base'
|
2
|
+
require 'escapement/elements/paragraph'
|
3
|
+
require 'escapement/elements/list'
|
4
|
+
require 'escapement/elements/ordered_list'
|
5
|
+
require 'escapement/elements/unordered_list'
|
6
|
+
|
7
|
+
module Escapement
|
8
|
+
# An element represents a root-level element in the given
|
9
|
+
# HTML string. Each paragraph has its own text value and
|
10
|
+
# array of entities.
|
11
|
+
module Element
|
12
|
+
extend self
|
13
|
+
|
14
|
+
NODE_TYPES = [
|
15
|
+
Element::Paragraph,
|
16
|
+
Element::OrderedList,
|
17
|
+
Element::UnorderedList
|
18
|
+
].freeze
|
19
|
+
|
20
|
+
def factory(node)
|
21
|
+
NODE_TYPES.each do |type|
|
22
|
+
if type.should_handle?(node)
|
23
|
+
return type.new(node)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
nil
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Escapement
|
2
|
+
module Element
|
3
|
+
class Base
|
4
|
+
include Traversal
|
5
|
+
include PrettyNames
|
6
|
+
|
7
|
+
attr_reader :node, :result
|
8
|
+
|
9
|
+
def initialize(node)
|
10
|
+
@node = node
|
11
|
+
@entities = []
|
12
|
+
@result = nil
|
13
|
+
@current_position = 0
|
14
|
+
end
|
15
|
+
|
16
|
+
def process!
|
17
|
+
raise "Override"
|
18
|
+
end
|
19
|
+
|
20
|
+
protected
|
21
|
+
|
22
|
+
def node_is_blank?
|
23
|
+
# This will match empty strings, strings with spaces, and
|
24
|
+
# even strings with unicode non-breakable spaces (which can be
|
25
|
+
# produced by &nbsp;)
|
26
|
+
node.text =~ /\A[[:space:]]*\z/
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Escapement
|
2
|
+
module Element
|
3
|
+
module List
|
4
|
+
def process!
|
5
|
+
return if node_is_blank?
|
6
|
+
|
7
|
+
@entities = node.children.map { |child|
|
8
|
+
next if child.text?
|
9
|
+
Element.factory(child).tap(&:process!)
|
10
|
+
}.compact
|
11
|
+
|
12
|
+
@result = {
|
13
|
+
type: node_to_type,
|
14
|
+
html_tag: node.name,
|
15
|
+
children: @entities.map(&:result)
|
16
|
+
}
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Escapement
|
2
|
+
module Element
|
3
|
+
class Paragraph < Base
|
4
|
+
def self.should_handle?(node)
|
5
|
+
['p', 'li'].include? node.name
|
6
|
+
end
|
7
|
+
|
8
|
+
def process!
|
9
|
+
return if node_is_blank?
|
10
|
+
|
11
|
+
process_children
|
12
|
+
|
13
|
+
@result = {
|
14
|
+
type: node_to_type,
|
15
|
+
html_tag: node.name,
|
16
|
+
text: node.text,
|
17
|
+
entities: @entities
|
18
|
+
}
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/escapement/html.rb
CHANGED
@@ -1,22 +1,22 @@
|
|
1
1
|
module Escapement
|
2
2
|
# Wrapper around the entire document, which contains an array of
|
3
3
|
# results. Each result is the text value and entities for a single
|
4
|
-
#
|
4
|
+
# element.
|
5
5
|
class HTML
|
6
|
-
attr_reader :doc, :
|
6
|
+
attr_reader :doc, :elements, :results
|
7
7
|
|
8
8
|
def initialize(html)
|
9
9
|
@doc = Nokogiri::HTML(html)
|
10
|
-
@
|
10
|
+
@elements = []
|
11
11
|
@results = nil
|
12
12
|
end
|
13
13
|
|
14
|
-
# Extracts all of the entities for each
|
14
|
+
# Extracts all of the entities for each element.
|
15
15
|
def extract!
|
16
16
|
preprocess!
|
17
17
|
|
18
|
-
@
|
19
|
-
@results = @
|
18
|
+
@elements = doc.css('body').children.map { |child| Element.factory(child) }.compact
|
19
|
+
@results = @elements.each(&:process!).reject { |e| e.result.nil? }.map(&:result)
|
20
20
|
end
|
21
21
|
|
22
22
|
private
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Escapement
|
2
|
+
module PrettyNames
|
3
|
+
def node_to_type
|
4
|
+
case node.name
|
5
|
+
when 'p' then 'paragraph'
|
6
|
+
when 'ul' then 'unordered_list'
|
7
|
+
when 'ol' then 'ordered_list'
|
8
|
+
when 'a' then 'link'
|
9
|
+
when 'i', 'em' then 'italic'
|
10
|
+
when 'u' then 'underline'
|
11
|
+
when 'strong', 'b' then 'bold'
|
12
|
+
when 'abbr' then 'abbreviation'
|
13
|
+
when 'q' then 'quote'
|
14
|
+
when 'pre' then 'preformatted'
|
15
|
+
when 'img' then 'image'
|
16
|
+
when 'li' then 'list_item'
|
17
|
+
when 'sup' then 'superscript'
|
18
|
+
when 'sub' then 'subscript'
|
19
|
+
when /h\d/ then 'header'
|
20
|
+
else node.name
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/escapement/tag.rb
CHANGED
@@ -4,6 +4,7 @@ module Escapement
|
|
4
4
|
# the traversal until we reach the leaf text node.
|
5
5
|
class Tag
|
6
6
|
include Traversal
|
7
|
+
include PrettyNames
|
7
8
|
|
8
9
|
attr_reader :node, :entities
|
9
10
|
|
@@ -26,25 +27,6 @@ module Escapement
|
|
26
27
|
|
27
28
|
private
|
28
29
|
|
29
|
-
def node_to_type
|
30
|
-
case node.name
|
31
|
-
when 'p' then 'paragraph'
|
32
|
-
when 'a' then 'link'
|
33
|
-
when 'i', 'em' then 'italic'
|
34
|
-
when 'u' then 'underline'
|
35
|
-
when 'strong', 'b' then 'bold'
|
36
|
-
when 'abbr' then 'abbreviation'
|
37
|
-
when 'q' then 'quote'
|
38
|
-
when 'pre' then 'preformatted'
|
39
|
-
when 'img' then 'image'
|
40
|
-
when 'li' then 'list_item'
|
41
|
-
when 'sup' then 'superscript'
|
42
|
-
when 'sub' then 'subscript'
|
43
|
-
when /h\d/ then 'header'
|
44
|
-
else node.name
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
30
|
def filtered_attributes
|
49
31
|
method_name = Attributes.respond_to?(node.name) ? node.name : :default
|
50
32
|
node.attributes.select(&Attributes.method(method_name))
|
data/lib/escapement/traversal.rb
CHANGED
@@ -2,7 +2,7 @@ module Escapement
|
|
2
2
|
module Traversal
|
3
3
|
# Processes all child nodes of the current node. As the recursion unwinds, we
|
4
4
|
# update the entities array such that we're left with a full result set at
|
5
|
-
# the root, which is the
|
5
|
+
# the root, which is the Element object.
|
6
6
|
def process_children
|
7
7
|
node.children.each do |child|
|
8
8
|
if child.text?
|
data/lib/escapement/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: escapement
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan LeFevre
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-03-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -87,8 +87,14 @@ files:
|
|
87
87
|
- escapement.gemspec
|
88
88
|
- lib/escapement.rb
|
89
89
|
- lib/escapement/attributes.rb
|
90
|
-
- lib/escapement/
|
90
|
+
- lib/escapement/element.rb
|
91
|
+
- lib/escapement/elements/base.rb
|
92
|
+
- lib/escapement/elements/list.rb
|
93
|
+
- lib/escapement/elements/ordered_list.rb
|
94
|
+
- lib/escapement/elements/paragraph.rb
|
95
|
+
- lib/escapement/elements/unordered_list.rb
|
91
96
|
- lib/escapement/html.rb
|
97
|
+
- lib/escapement/pretty_names.rb
|
92
98
|
- lib/escapement/tag.rb
|
93
99
|
- lib/escapement/traversal.rb
|
94
100
|
- lib/escapement/version.rb
|
data/lib/escapement/block.rb
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
module Escapement
|
2
|
-
# A block represents a paragraph, which is a root-level element in the
|
3
|
-
# given HTML string. Each paragraph has its own text value and array of entities.
|
4
|
-
class Block
|
5
|
-
include Traversal
|
6
|
-
|
7
|
-
attr_reader :node, :result
|
8
|
-
|
9
|
-
def initialize(node)
|
10
|
-
@node = node
|
11
|
-
@entities = []
|
12
|
-
@result = nil
|
13
|
-
@current_position = 0
|
14
|
-
end
|
15
|
-
|
16
|
-
def process!
|
17
|
-
# This will match empty strings, strings with spaces, and
|
18
|
-
# even strings with unicode non-breakable spaces (which can be
|
19
|
-
# produced by &nbsp;)
|
20
|
-
return if node.text =~ /\A[[:space:]]*\z/
|
21
|
-
|
22
|
-
process_children
|
23
|
-
|
24
|
-
@result = {
|
25
|
-
text: node.text,
|
26
|
-
entities: @entities
|
27
|
-
}
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|