escapement 1.0.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6fa90e5289492fd35e77857b9caac7a9d70f2bee
4
- data.tar.gz: e1786a0edbbb18ec4aa741cd762d73abdd6f20bf
3
+ metadata.gz: a022be5b1869ab5770ced1ed81a0ffa124739244
4
+ data.tar.gz: e808af33e9c5a56413dfc3973ccca4ffdce5da60
5
5
  SHA512:
6
- metadata.gz: a11eca3bcb49edf065e4c5caec62dd61588e77febaea83cbbe757ac8e701ae544aaa1336e6294a69a92899c8e0b5aba4d87d592e543eae3f78f5a276b3ce7750
7
- data.tar.gz: a82bf4073571a62cbcac6416680271e253d5d55af1624f9b916a27288d846d803e9434c48f04f8919655d7c49c522102bd9067d684d8548b7773ef3e4fe478a2
6
+ metadata.gz: 909dd0b5780eea310435ada122f4b93529befa2f2af7be25d5e590cc840349704cc0cedd6e9cf8f9cea8f20f3ac8ddbeb78477ffc77533426e9670c199f4e955
7
+ data.tar.gz: 35e0febb031007db8d9da3bd114ec530595e6d8eeb9700feebd15699ffaa37ebf35a7025cc9b77c604799df5d49edfe4787cd24f77b9a700518099ccd55dc7f8
data/README.md CHANGED
@@ -20,7 +20,7 @@ Or install it yourself as:
20
20
 
21
21
  ## Usage
22
22
 
23
- Basic usage is very straightforward. Escapement will consider all root-level tags as separate paragraphs.
23
+ Basic usage is very straightforward. Escapement will consider all root-level tags as separate elements.
24
24
 
25
25
  The position values are 0-based and are relative to the plain text result. The first value is the start of the attributed text, and the second is the end of the attributed text.
26
26
 
@@ -30,7 +30,19 @@ body = "<p>Isn't <i>Tourbillon</i> a <a href=\"http://google.com\">great</a> wor
30
30
  html = Escapement::HTML.new(body)
31
31
  html.extract!
32
32
  html.results
33
- # => [{:text=>"Isn't Tourbillon a great word?", :entities=>[{:type=>"italic", :html_tag=>"i", :position=>[6, 16], :attributes=>{}}, {:type=>"link", :html_tag=>"a", :position=>[19, 24], :attributes=>{"href"=>"http://google.com"}}]}]
33
+ # => [{:type=>"paragraph", :html_tag=>"p", :text=>"Isn't Tourbillon a great word?", :entities=>[{:type=>"italic", :html_tag=>"i", :position=>[6, 16], :attributes=>{}}, {:type=>"link", :html_tag=>"a", :position=>[19, 24], :attributes=>{"href"=>"http://google.com"}}]}]
34
+ ```
35
+
36
+ Escapement also supports lists (with nesting), treating each list item as a separate paragraph-like element.
37
+
38
+ ``` ruby
39
+ body = "<ul><li>List item 1</li><ul><li><b>Nested</b> list item</li></ul><li>List item 2</li></ul>"
40
+
41
+ html = Escapement::HTML.new(body)
42
+ html.extract!
43
+ html.results
44
+
45
+ # => [{:type=>"unordered_list", :html_tag=>"ul", :children=>[{:type=>"list_item", :html_tag=>"li", :text=>"List item 1", :entities=>[]}, {:type=>"unordered_list", :html_tag=>"ul", :children=>[{:type=>"list_item", :html_tag=>"li", :text=>"Nested list item", :entities=>[{:type=>"bold", :html_tag=>"b", :position=>[0, 6], :attributes=>{}}]}]}, {:type=>"list_item", :html_tag=>"li", :text=>"List item 2", :entities=>[]}]}]
34
46
  ```
35
47
 
36
48
  ## How It Works
@@ -1,7 +1,8 @@
1
1
  require "nokogiri"
2
2
  require "escapement/traversal"
3
+ require "escapement/pretty_names"
3
4
  require "escapement/attributes"
4
- require "escapement/block"
5
+ require "escapement/element"
5
6
  require "escapement/html"
6
7
  require "escapement/tag"
7
8
  require "escapement/version"
@@ -0,0 +1,30 @@
1
+ require 'escapement/elements/base'
2
+ require 'escapement/elements/paragraph'
3
+ require 'escapement/elements/list'
4
+ require 'escapement/elements/ordered_list'
5
+ require 'escapement/elements/unordered_list'
6
+
7
+ module Escapement
8
+ # An element represents a root-level element in the given
9
+ # HTML string. Each paragraph has its own text value and
10
+ # array of entities.
11
+ module Element
12
+ extend self
13
+
14
+ NODE_TYPES = [
15
+ Element::Paragraph,
16
+ Element::OrderedList,
17
+ Element::UnorderedList
18
+ ].freeze
19
+
20
+ def factory(node)
21
+ NODE_TYPES.each do |type|
22
+ if type.should_handle?(node)
23
+ return type.new(node)
24
+ end
25
+ end
26
+
27
+ nil
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,30 @@
1
+ module Escapement
2
+ module Element
3
+ class Base
4
+ include Traversal
5
+ include PrettyNames
6
+
7
+ attr_reader :node, :result
8
+
9
+ def initialize(node)
10
+ @node = node
11
+ @entities = []
12
+ @result = nil
13
+ @current_position = 0
14
+ end
15
+
16
+ def process!
17
+ raise "Override"
18
+ end
19
+
20
+ protected
21
+
22
+ def node_is_blank?
23
+ # This will match empty strings, strings with spaces, and
24
+ # even strings with Unicode non-breaking spaces (which can be
25
+ # produced by &nbsp;)
26
+ node.text =~ /\A[[:space:]]*\z/
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,20 @@
1
+ module Escapement
2
+ module Element
3
+ module List
4
+ def process!
5
+ return if node_is_blank?
6
+
7
+ @entities = node.children.map { |child|
8
+ next if child.text?
9
+ Element.factory(child).tap(&:process!)
10
+ }.compact
11
+
12
+ @result = {
13
+ type: node_to_type,
14
+ html_tag: node.name,
15
+ children: @entities.map(&:result)
16
+ }
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,11 @@
1
+ module Escapement
2
+ module Element
3
+ class OrderedList < Base
4
+ include List
5
+
6
+ def self.should_handle?(node)
7
+ node.name == 'ol'
8
+ end
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,22 @@
1
+ module Escapement
2
+ module Element
3
+ class Paragraph < Base
4
+ def self.should_handle?(node)
5
+ ['p', 'li'].include? node.name
6
+ end
7
+
8
+ def process!
9
+ return if node_is_blank?
10
+
11
+ process_children
12
+
13
+ @result = {
14
+ type: node_to_type,
15
+ html_tag: node.name,
16
+ text: node.text,
17
+ entities: @entities
18
+ }
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,11 @@
1
+ module Escapement
2
+ module Element
3
+ class UnorderedList < Base
4
+ include List
5
+
6
+ def self.should_handle?(node)
7
+ node.name == 'ul'
8
+ end
9
+ end
10
+ end
11
+ end
@@ -1,22 +1,22 @@
1
1
  module Escapement
2
2
  # Wrapper around the entire document, which contains an array of
3
3
  # results. Each result is the text value and entities for a single
4
- # paragraph/block.
4
+ # element.
5
5
  class HTML
6
- attr_reader :doc, :blocks, :results
6
+ attr_reader :doc, :elements, :results
7
7
 
8
8
  def initialize(html)
9
9
  @doc = Nokogiri::HTML(html)
10
- @blocks = []
10
+ @elements = []
11
11
  @results = nil
12
12
  end
13
13
 
14
- # Extracts all of the entities for each paragraph/block.
14
+ # Extracts all of the entities for each element.
15
15
  def extract!
16
16
  preprocess!
17
17
 
18
- @blocks = doc.css('body').children.map { |child| Block.new(child).tap(&:process!) }
19
- @results = @blocks.reject { |b| b.result.nil? }.map(&:result)
18
+ @elements = doc.css('body').children.map { |child| Element.factory(child) }.compact
19
+ @results = @elements.each(&:process!).reject { |e| e.result.nil? }.map(&:result)
20
20
  end
21
21
 
22
22
  private
@@ -0,0 +1,24 @@
1
+ module Escapement
2
+ module PrettyNames
3
+ def node_to_type
4
+ case node.name
5
+ when 'p' then 'paragraph'
6
+ when 'ul' then 'unordered_list'
7
+ when 'ol' then 'ordered_list'
8
+ when 'a' then 'link'
9
+ when 'i', 'em' then 'italic'
10
+ when 'u' then 'underline'
11
+ when 'strong', 'b' then 'bold'
12
+ when 'abbr' then 'abbreviation'
13
+ when 'q' then 'quote'
14
+ when 'pre' then 'preformatted'
15
+ when 'img' then 'image'
16
+ when 'li' then 'list_item'
17
+ when 'sup' then 'superscript'
18
+ when 'sub' then 'subscript'
19
+ when /h\d/ then 'header'
20
+ else node.name
21
+ end
22
+ end
23
+ end
24
+ end
@@ -4,6 +4,7 @@ module Escapement
4
4
  # the traversal until we reach the leaf text node.
5
5
  class Tag
6
6
  include Traversal
7
+ include PrettyNames
7
8
 
8
9
  attr_reader :node, :entities
9
10
 
@@ -26,25 +27,6 @@ module Escapement
26
27
 
27
28
  private
28
29
 
29
- def node_to_type
30
- case node.name
31
- when 'p' then 'paragraph'
32
- when 'a' then 'link'
33
- when 'i', 'em' then 'italic'
34
- when 'u' then 'underline'
35
- when 'strong', 'b' then 'bold'
36
- when 'abbr' then 'abbreviation'
37
- when 'q' then 'quote'
38
- when 'pre' then 'preformatted'
39
- when 'img' then 'image'
40
- when 'li' then 'list_item'
41
- when 'sup' then 'superscript'
42
- when 'sub' then 'subscript'
43
- when /h\d/ then 'header'
44
- else node.name
45
- end
46
- end
47
-
48
30
  def filtered_attributes
49
31
  method_name = Attributes.respond_to?(node.name) ? node.name : :default
50
32
  node.attributes.select(&Attributes.method(method_name))
@@ -2,7 +2,7 @@ module Escapement
2
2
  module Traversal
3
3
  # Processes all child nodes of the current node. As the recursion unwinds, we
4
4
  # update the entities array such that we're left with a full result set at
5
- # the root, which is the Block object.
5
+ # the root, which is the Element object.
6
6
  def process_children
7
7
  node.children.each do |child|
8
8
  if child.text?
@@ -1,3 +1,3 @@
1
1
  module Escapement
2
- VERSION = "1.0.0"
2
+ VERSION = "2.0.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: escapement
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan LeFevre
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-06-30 00:00:00.000000000 Z
11
+ date: 2017-03-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -87,8 +87,14 @@ files:
87
87
  - escapement.gemspec
88
88
  - lib/escapement.rb
89
89
  - lib/escapement/attributes.rb
90
- - lib/escapement/block.rb
90
+ - lib/escapement/element.rb
91
+ - lib/escapement/elements/base.rb
92
+ - lib/escapement/elements/list.rb
93
+ - lib/escapement/elements/ordered_list.rb
94
+ - lib/escapement/elements/paragraph.rb
95
+ - lib/escapement/elements/unordered_list.rb
91
96
  - lib/escapement/html.rb
97
+ - lib/escapement/pretty_names.rb
92
98
  - lib/escapement/tag.rb
93
99
  - lib/escapement/traversal.rb
94
100
  - lib/escapement/version.rb
@@ -1,30 +0,0 @@
1
- module Escapement
2
- # A block represents a paragraph, which is a root-level element in the
3
- # given HTML string. Each paragraph has its own text value and array of entities.
4
- class Block
5
- include Traversal
6
-
7
- attr_reader :node, :result
8
-
9
- def initialize(node)
10
- @node = node
11
- @entities = []
12
- @result = nil
13
- @current_position = 0
14
- end
15
-
16
- def process!
17
- # This will match empty strings, strings with spaces, and
18
- # even strings with Unicode non-breaking spaces (which can be
19
- # produced by &nbsp;)
20
- return if node.text =~ /\A[[:space:]]*\z/
21
-
22
- process_children
23
-
24
- @result = {
25
- text: node.text,
26
- entities: @entities
27
- }
28
- end
29
- end
30
- end