escapement 1.0.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +14 -2
- data/lib/escapement.rb +2 -1
- data/lib/escapement/element.rb +30 -0
- data/lib/escapement/elements/base.rb +30 -0
- data/lib/escapement/elements/list.rb +20 -0
- data/lib/escapement/elements/ordered_list.rb +11 -0
- data/lib/escapement/elements/paragraph.rb +22 -0
- data/lib/escapement/elements/unordered_list.rb +11 -0
- data/lib/escapement/html.rb +6 -6
- data/lib/escapement/pretty_names.rb +24 -0
- data/lib/escapement/tag.rb +1 -19
- data/lib/escapement/traversal.rb +1 -1
- data/lib/escapement/version.rb +1 -1
- metadata +9 -3
- data/lib/escapement/block.rb +0 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a022be5b1869ab5770ced1ed81a0ffa124739244
|
4
|
+
data.tar.gz: e808af33e9c5a56413dfc3973ccca4ffdce5da60
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 909dd0b5780eea310435ada122f4b93529befa2f2af7be25d5e590cc840349704cc0cedd6e9cf8f9cea8f20f3ac8ddbeb78477ffc77533426e9670c199f4e955
|
7
|
+
data.tar.gz: 35e0febb031007db8d9da3bd114ec530595e6d8eeb9700feebd15699ffaa37ebf35a7025cc9b77c604799df5d49edfe4787cd24f77b9a700518099ccd55dc7f8
|
data/README.md
CHANGED
@@ -20,7 +20,7 @@ Or install it yourself as:
|
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
|
-
Basic usage is very straightforward. Escapement will consider all root-level tags as separate
|
23
|
+
Basic usage is very straightforward. Escapement will consider all root-level tags as separate elements.
|
24
24
|
|
25
25
|
The position values are 0-based and are relative to the plain text result. The first value is the start of the attributed text, and the second is the end of the attributed text.
|
26
26
|
|
@@ -30,7 +30,19 @@ body = "<p>Isn't <i>Tourbillon</i> a <a href=\"http://google.com\">great</a> wor
|
|
30
30
|
html = Escapement::HTML.new(body)
|
31
31
|
html.extract!
|
32
32
|
html.results
|
33
|
-
# => [{:text=>"Isn't Tourbillon a great word?", :entities=>[{:type=>"italic", :html_tag=>"i", :position=>[6, 16], :attributes=>{}}, {:type=>"link", :html_tag=>"a", :position=>[19, 24], :attributes=>{"href"=>"http://google.com"}}]}]
|
33
|
+
# => [{:type=>"paragraph", :html_tag=>"p", :text=>"Isn't Tourbillon a great word?", :entities=>[{:type=>"italic", :html_tag=>"i", :position=>[6, 16], :attributes=>{}}, {:type=>"link", :html_tag=>"a", :position=>[19, 24], :attributes=>{"href"=>"http://google.com"}}]}]
|
34
|
+
```
|
35
|
+
|
36
|
+
Escapement also supports lists (with nesting), treating each list item as a separate paragraph-like element.
|
37
|
+
|
38
|
+
``` ruby
|
39
|
+
body = "<ul><li>List item 1</li><ul><li><b>Nested</b> list item</li></ul><li>List item 2</li></ul>"
|
40
|
+
|
41
|
+
html = Escapement::HTML.new(body)
|
42
|
+
html.extract!
|
43
|
+
html.results
|
44
|
+
|
45
|
+
# => [{:type=>"unordered_list", :html_tag=>"ul", :children=>[{:type=>"list_item", :html_tag=>"li", :text=>"List item 1", :entities=>[]}, {:type=>"unordered_list", :html_tag=>"ul", :children=>[{:type=>"list_item", :html_tag=>"li", :text=>"Nested list item", :entities=>[{:type=>"bold", :html_tag=>"b", :position=>[0, 6], :attributes=>{}}]}]}, {:type=>"list_item", :html_tag=>"li", :text=>"List item 2", :entities=>[]}]}]
|
34
46
|
```
|
35
47
|
|
36
48
|
## How It Works
|
data/lib/escapement.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
require "nokogiri"
|
2
2
|
require "escapement/traversal"
|
3
|
+
require "escapement/pretty_names"
|
3
4
|
require "escapement/attributes"
|
4
|
-
require "escapement/block"
|
5
|
+
require "escapement/element"
|
5
6
|
require "escapement/html"
|
6
7
|
require "escapement/tag"
|
7
8
|
require "escapement/version"
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require 'escapement/elements/base'
|
2
|
+
require 'escapement/elements/paragraph'
|
3
|
+
require 'escapement/elements/list'
|
4
|
+
require 'escapement/elements/ordered_list'
|
5
|
+
require 'escapement/elements/unordered_list'
|
6
|
+
|
7
|
+
module Escapement
|
8
|
+
# An element represents a root-level element in the given
|
9
|
+
# HTML string. Each paragraph has its own text value and
|
10
|
+
# array of entities.
|
11
|
+
module Element
|
12
|
+
extend self
|
13
|
+
|
14
|
+
NODE_TYPES = [
|
15
|
+
Element::Paragraph,
|
16
|
+
Element::OrderedList,
|
17
|
+
Element::UnorderedList
|
18
|
+
].freeze
|
19
|
+
|
20
|
+
def factory(node)
|
21
|
+
NODE_TYPES.each do |type|
|
22
|
+
if type.should_handle?(node)
|
23
|
+
return type.new(node)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
nil
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
module Escapement
|
2
|
+
module Element
|
3
|
+
class Base
|
4
|
+
include Traversal
|
5
|
+
include PrettyNames
|
6
|
+
|
7
|
+
attr_reader :node, :result
|
8
|
+
|
9
|
+
def initialize(node)
|
10
|
+
@node = node
|
11
|
+
@entities = []
|
12
|
+
@result = nil
|
13
|
+
@current_position = 0
|
14
|
+
end
|
15
|
+
|
16
|
+
def process!
|
17
|
+
raise "Override"
|
18
|
+
end
|
19
|
+
|
20
|
+
protected
|
21
|
+
|
22
|
+
def node_is_blank?
|
23
|
+
# This will match empty strings, strings with spaces, and
|
24
|
+
# even strings with unicode non-breakable spaces (which can be
|
25
|
+
# produced by &nbsp;)
|
26
|
+
node.text =~ /\A[[:space:]]*\z/
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Escapement
|
2
|
+
module Element
|
3
|
+
module List
|
4
|
+
def process!
|
5
|
+
return if node_is_blank?
|
6
|
+
|
7
|
+
@entities = node.children.map { |child|
|
8
|
+
next if child.text?
|
9
|
+
Element.factory(child).tap(&:process!)
|
10
|
+
}.compact
|
11
|
+
|
12
|
+
@result = {
|
13
|
+
type: node_to_type,
|
14
|
+
html_tag: node.name,
|
15
|
+
children: @entities.map(&:result)
|
16
|
+
}
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Escapement
|
2
|
+
module Element
|
3
|
+
class Paragraph < Base
|
4
|
+
def self.should_handle?(node)
|
5
|
+
['p', 'li'].include? node.name
|
6
|
+
end
|
7
|
+
|
8
|
+
def process!
|
9
|
+
return if node_is_blank?
|
10
|
+
|
11
|
+
process_children
|
12
|
+
|
13
|
+
@result = {
|
14
|
+
type: node_to_type,
|
15
|
+
html_tag: node.name,
|
16
|
+
text: node.text,
|
17
|
+
entities: @entities
|
18
|
+
}
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/escapement/html.rb
CHANGED
@@ -1,22 +1,22 @@
|
|
1
1
|
module Escapement
|
2
2
|
# Wrapper around the entire document, which contains an array of
|
3
3
|
# results. Each result is the text value and entities for a single
|
4
|
-
#
|
4
|
+
# element.
|
5
5
|
class HTML
|
6
|
-
attr_reader :doc, :
|
6
|
+
attr_reader :doc, :elements, :results
|
7
7
|
|
8
8
|
def initialize(html)
|
9
9
|
@doc = Nokogiri::HTML(html)
|
10
|
-
@
|
10
|
+
@elements = []
|
11
11
|
@results = nil
|
12
12
|
end
|
13
13
|
|
14
|
-
# Extracts all of the entities for each
|
14
|
+
# Extracts all of the entities for each element.
|
15
15
|
def extract!
|
16
16
|
preprocess!
|
17
17
|
|
18
|
-
@
|
19
|
-
@results = @
|
18
|
+
@elements = doc.css('body').children.map { |child| Element.factory(child) }.compact
|
19
|
+
@results = @elements.each(&:process!).reject { |e| e.result.nil? }.map(&:result)
|
20
20
|
end
|
21
21
|
|
22
22
|
private
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Escapement
|
2
|
+
module PrettyNames
|
3
|
+
def node_to_type
|
4
|
+
case node.name
|
5
|
+
when 'p' then 'paragraph'
|
6
|
+
when 'ul' then 'unordered_list'
|
7
|
+
when 'ol' then 'ordered_list'
|
8
|
+
when 'a' then 'link'
|
9
|
+
when 'i', 'em' then 'italic'
|
10
|
+
when 'u' then 'underline'
|
11
|
+
when 'strong', 'b' then 'bold'
|
12
|
+
when 'abbr' then 'abbreviation'
|
13
|
+
when 'q' then 'quote'
|
14
|
+
when 'pre' then 'preformatted'
|
15
|
+
when 'img' then 'image'
|
16
|
+
when 'li' then 'list_item'
|
17
|
+
when 'sup' then 'superscript'
|
18
|
+
when 'sub' then 'subscript'
|
19
|
+
when /h\d/ then 'header'
|
20
|
+
else node.name
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/escapement/tag.rb
CHANGED
@@ -4,6 +4,7 @@ module Escapement
|
|
4
4
|
# the traversal until we reach the leaf text node.
|
5
5
|
class Tag
|
6
6
|
include Traversal
|
7
|
+
include PrettyNames
|
7
8
|
|
8
9
|
attr_reader :node, :entities
|
9
10
|
|
@@ -26,25 +27,6 @@ module Escapement
|
|
26
27
|
|
27
28
|
private
|
28
29
|
|
29
|
-
def node_to_type
|
30
|
-
case node.name
|
31
|
-
when 'p' then 'paragraph'
|
32
|
-
when 'a' then 'link'
|
33
|
-
when 'i', 'em' then 'italic'
|
34
|
-
when 'u' then 'underline'
|
35
|
-
when 'strong', 'b' then 'bold'
|
36
|
-
when 'abbr' then 'abbreviation'
|
37
|
-
when 'q' then 'quote'
|
38
|
-
when 'pre' then 'preformatted'
|
39
|
-
when 'img' then 'image'
|
40
|
-
when 'li' then 'list_item'
|
41
|
-
when 'sup' then 'superscript'
|
42
|
-
when 'sub' then 'subscript'
|
43
|
-
when /h\d/ then 'header'
|
44
|
-
else node.name
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
30
|
def filtered_attributes
|
49
31
|
method_name = Attributes.respond_to?(node.name) ? node.name : :default
|
50
32
|
node.attributes.select(&Attributes.method(method_name))
|
data/lib/escapement/traversal.rb
CHANGED
@@ -2,7 +2,7 @@ module Escapement
|
|
2
2
|
module Traversal
|
3
3
|
# Processes all child nodes of the current node. As the recursion unwinds, we
|
4
4
|
# update the entities array such that we're left with a full result set at
|
5
|
-
# the root, which is the Block object.
|
5
|
+
# the root, which is the Element object.
|
6
6
|
def process_children
|
7
7
|
node.children.each do |child|
|
8
8
|
if child.text?
|
data/lib/escapement/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: escapement
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ryan LeFevre
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-03-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -87,8 +87,14 @@ files:
|
|
87
87
|
- escapement.gemspec
|
88
88
|
- lib/escapement.rb
|
89
89
|
- lib/escapement/attributes.rb
|
90
|
-
- lib/escapement/block.rb
|
90
|
+
- lib/escapement/element.rb
|
91
|
+
- lib/escapement/elements/base.rb
|
92
|
+
- lib/escapement/elements/list.rb
|
93
|
+
- lib/escapement/elements/ordered_list.rb
|
94
|
+
- lib/escapement/elements/paragraph.rb
|
95
|
+
- lib/escapement/elements/unordered_list.rb
|
91
96
|
- lib/escapement/html.rb
|
97
|
+
- lib/escapement/pretty_names.rb
|
92
98
|
- lib/escapement/tag.rb
|
93
99
|
- lib/escapement/traversal.rb
|
94
100
|
- lib/escapement/version.rb
|
data/lib/escapement/block.rb
DELETED
@@ -1,30 +0,0 @@
|
|
1
|
-
module Escapement
|
2
|
-
# A block represents a paragraph, which is a root-level element in the
|
3
|
-
# given HTML string. Each paragraph has it's own text value and array of entities.
|
4
|
-
class Block
|
5
|
-
include Traversal
|
6
|
-
|
7
|
-
attr_reader :node, :result
|
8
|
-
|
9
|
-
def initialize(node)
|
10
|
-
@node = node
|
11
|
-
@entities = []
|
12
|
-
@result = nil
|
13
|
-
@current_position = 0
|
14
|
-
end
|
15
|
-
|
16
|
-
def process!
|
17
|
-
# This will match empty strings, strings with spaces, and
|
18
|
-
# even strings with unicode non-breakable spaces (which can be
|
19
|
-
# produced by &nbsp;)
|
20
|
-
return if node.text =~ /\A[[:space:]]*\z/
|
21
|
-
|
22
|
-
process_children
|
23
|
-
|
24
|
-
@result = {
|
25
|
-
text: node.text,
|
26
|
-
entities: @entities
|
27
|
-
}
|
28
|
-
end
|
29
|
-
end
|
30
|
-
end
|