escapement 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/README.md +52 -0
- data/Rakefile +1 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/escapement.gemspec +28 -0
- data/lib/escapement/attributes.rb +19 -0
- data/lib/escapement/block.rb +25 -0
- data/lib/escapement/html.rb +20 -0
- data/lib/escapement/tag.rb +51 -0
- data/lib/escapement/traversal.rb +24 -0
- data/lib/escapement/version.rb +3 -0
- data/lib/escapement.rb +7 -0
- metadata +116 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 7756a2e57d00b9d7842a0f09fb4396755144b620
|
4
|
+
data.tar.gz: e04508786e9c91d6778f4f1e97a4386c79025f33
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 65e3b2b84fb12da56c5f979764776b65aac28e992246b4ff601603fe51624d9623dc0287ffeb94f8993ecc531f2a2974bc41bbabf2f0538aa8e5cd9c1e753d2b
|
7
|
+
data.tar.gz: 744c837d2c0477c353fe604e0b192f31df63894ee2403b70a6e28323d7253699bc02613fb841e4700457dc02c4e2b0e11102957f163986181533c7542cfdaee3
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# Escapement
|
2
|
+
|
3
|
+
Given an HTML-formatted string, escapement will extract descendant tags into a device-agnostic attributes array that can be used for formatting the text anywhere.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
gem 'escapement'
|
11
|
+
```
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install escapement
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
Basic usage is very straightforward. Escapement will consider all root-level tags as separate paragraphs.
|
24
|
+
|
25
|
+
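The position values are 0-based character offsets relative to the plain text result. The first value is the start of the attributed text, and the second is the end of the attributed text (exclusive), i.e. the start plus the length of the attributed text. In the example below, "Tourbillon" starts at offset 6 and is 10 characters long, so its position is `[6, 16]`.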
|
26
|
+
|
27
|
+
``` ruby
|
28
|
+
body = "<p>Isn't <i>Tourbillon</i> a <a href=\"http://google.com\">great</a> word?</p>"
|
29
|
+
|
30
|
+
html = Escapement::HTML.new(body)
|
31
|
+
html.extract!
|
32
|
+
html.results
|
33
|
+
# => [{:text=>"Isn't Tourbillon a great word?", :entities=>[{:type=>"italic", :html_tag=>"i", :position=>[6, 16], :attributes=>{}}, {:type=>"link", :html_tag=>"a", :position=>[19, 24], :attributes=>{"href"=>"http://google.com"}}]}]
|
34
|
+
```
|
35
|
+
|
36
|
+
## How It Works
|
37
|
+
|
38
|
+
From a high level, Escapement uses [Nokogiri](https://github.com/sparklemotion/nokogiri) to recursively traverse the DOM tree. As it traverses, it keeps track of the current position of the node relative to the text content in order to determine entity position. There are no regular expression hacks involved.
|
39
|
+
|
40
|
+
## Development
|
41
|
+
|
42
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
|
43
|
+
|
44
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
45
|
+
|
46
|
+
## Contributing
|
47
|
+
|
48
|
+
1. Fork it ( https://github.com/[my-github-username]/escapement/fork )
|
49
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
50
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
51
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
52
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "bundler/setup"
|
4
|
+
require "escapement"
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require "irb"
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
data/escapement.gemspec
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'escapement/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "escapement"
|
8
|
+
spec.version = Escapement::VERSION
|
9
|
+
spec.authors = ["Ryan LeFevre"]
|
10
|
+
spec.email = ["ryan@hodinkee.com"]
|
11
|
+
|
12
|
+
spec.summary = %q{Extract child entities from an HTML string.}
|
13
|
+
spec.description = %q{Given a HTML formatted string, escapement will extract descendant tags into a device agnostic attributes array that can be used for formatting the text anywhere.}
|
14
|
+
spec.homepage = "https://github.com/hodinkee/escapement"
|
15
|
+
|
16
|
+
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
17
|
+
spec.bindir = "exe"
|
18
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.required_ruby_version = ">= 2.0.0"
|
22
|
+
|
23
|
+
spec.add_dependency "nokogiri", "~> 1.6"
|
24
|
+
|
25
|
+
spec.add_development_dependency "bundler", "~> 1.9"
|
26
|
+
spec.add_development_dependency "rake", "~> 10.0"
|
27
|
+
spec.add_development_dependency "rspec", "~> 3"
|
28
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Escapement
  # These methods filter the allowed attributes on entities in order to cut
  # down on the noise returned with the results.
  #
  # Each public method is named after an HTML tag and receives an attribute
  # name (String) and the attribute value. It returns true when the attribute
  # should be kept and false otherwise.
  #
  # NOTE: Tag#filtered_attributes looks filters up by tag name, so every
  # public method defined here is treated as a per-tag filter. Do not add
  # public helper methods to this module.
  module Attributes
    extend self

    # Attribute names that are kept on <img> tags.
    IMG_ATTRIBUTES = %w[src width height].freeze

    # By default we allow no attributes in order to cut down on noise as
    # much as possible. Attributes can be whitelisted on a per-tag basis.
    #
    # Returns false for every attribute.
    def default(key, value)
      false
    end

    # Filter for <a> tags: only the link target is kept.
    #
    # Returns true when key is "href", false otherwise.
    def a(key, value)
      key == 'href'
    end

    # Filter for <img> tags: keeps the source and the dimensions.
    #
    # Returns true when key is one of IMG_ATTRIBUTES, false otherwise.
    def img(key, value)
      IMG_ATTRIBUTES.include?(key)
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Escapement
  # A block represents a paragraph, which is a root-level element in the
  # given HTML string. Each paragraph has its own text value and array of
  # entities.
  class Block
    include Traversal

    attr_reader :node, :result

    # node - The root-level node wrapped by this block. It is asked for
    #        #children (by Traversal#process_children) and for #text.
    def initialize(node)
      @node = node
      @result = nil
      reset_traversal
    end

    # Walks the children of the node, collecting every entity found, and
    # stores the outcome in #result.
    #
    # The traversal state is reset first so that calling this method more
    # than once yields the same result instead of appending duplicate
    # entities at shifted positions.
    #
    # Returns the result Hash with the keys :text and :entities.
    def process!
      reset_traversal
      process_children

      @result = {
        text: node.text,
        entities: @entities
      }
    end

    private

    # Starts a fresh traversal. A new Array is allocated (rather than
    # clearing the old one) so that a previously returned result is not
    # mutated behind the caller's back.
    def reset_traversal
      @entities = []
      @current_position = 0
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Escapement
  # Wrapper around the entire document, which contains an array of
  # results. Each result is the text value and entities for a single
  # paragraph/block.
  class HTML
    attr_reader :doc, :blocks, :results

    # html - The HTML String to parse. It is handed to Nokogiri::HTML as is.
    def initialize(html)
      @doc = Nokogiri::HTML(html)
      @blocks = []
      @results = nil
    end

    # Extracts all of the entities for each paragraph/block.
    #
    # Every child of <body> becomes one Block, except for nodes that carry
    # no paragraph content: comments and whitespace-only text nodes (for
    # example the newline between two root-level tags). Previously those
    # produced spurious results such as {text: "\n", entities: []}.
    #
    # Returns the Array of result Hashes, which is also available through
    # #results afterwards.
    def extract!
      paragraphs = doc.css('body').children.reject do |child|
        child.comment? || (child.text? && child.blank?)
      end

      @blocks = paragraphs.map { |child| Block.new(child).tap(&:process!) }
      @results = @blocks.map(&:result)
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Escapement
  # A tag represents an entity that may or may not have child elements.
  # Once we extract the data about this DOM node, we recursively continue
  # the traversal until we reach the leaf text node.
  class Tag
    include Traversal

    # Matches exactly the HTML heading elements h1 through h6. The pattern is
    # anchored so that tag names merely containing "h<digit>" are not
    # reported as headers.
    HEADER_TAG = /\Ah[1-6]\z/

    attr_reader :node, :entities

    # node           - The DOM node for this entity. It is asked for #name,
    #                  #text, #attributes and #children.
    # start_position - The Integer offset, relative to the text of the
    #                  enclosing block, at which this node's text begins.
    def initialize(node, start_position)
      @node = node
      @start_position = @current_position = start_position
      @entities = []
    end

    # Records the entity for this node and then descends into its children.
    # Entities found in descendants are appended to #entities by
    # Traversal#process_children.
    #
    # The recorded position is [start, start + text length], i.e. the end
    # offset is exclusive.
    def process
      @entities << {
        type: node_to_type,
        html_tag: node.name,
        position: [@start_position, @start_position + node.text.length],
        attributes: Hash[filtered_attributes.map { |name, attribute| [name, attribute.value] }]
      }

      process_children
    end

    private

    # Maps the HTML tag name to a descriptive entity type. Tags without a
    # dedicated type are reported under their own tag name.
    def node_to_type
      case node.name
      when 'p' then 'paragraph'
      when 'a' then 'link'
      when 'i', 'em' then 'italic'
      when 'u' then 'underline'
      when 'strong', 'b' then 'bold'
      when 'abbr' then 'abbreviation'
      when 'q' then 'quote'
      when 'pre' then 'preformatted'
      when 'img' then 'image'
      when 'li' then 'list_item'
      when HEADER_TAG then 'header'
      else node.name
      end
    end

    # Selects the attributes of the node that are whitelisted for its tag.
    #
    # The filter is looked up with Module#method_defined?, which only
    # considers methods defined by the Attributes module itself. Using
    # respond_to? here would also match methods every object inherits
    # (#name, #display, #hash, ...), so that a tag with such a name would
    # dispatch to an unrelated method and raise.
    def filtered_attributes
      method_name = Attributes.method_defined?(node.name) ? node.name : :default
      node.attributes.select(&Attributes.method(method_name))
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Escapement
  # Shared child traversal for Block and Tag. The including class must
  # provide #node and initialize @current_position (Integer) and @entities
  # (Array) before calling #process_children.
  module Traversal
    # Processes all child nodes of the current node. As the recursion unwinds, we
    # update the entities array such that we're left with a full result set at
    # the root, which is the Block object.
    def process_children
      node.children.each do |child|
        # Comments are not part of the text of the surrounding element, so
        # they must neither become entities nor advance the position;
        # otherwise every following offset would be shifted by the length
        # of the comment.
        next if child.comment?

        unless child.text?
          # The node is not a text node, so it must be an entity of some kind.
          # Continue the recursion, starting at the current offset.
          tag = Escapement::Tag.new(child, @current_position)
          tag.process
          @entities.concat tag.entities
        end

        # Text nodes contain no entities; for them, and after an entity has
        # been processed, we simply move past the child's text.
        @current_position += child.content.length
      end
    end
  end
end
|
data/lib/escapement.rb
ADDED
metadata
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: escapement
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ryan LeFevre
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-07-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.9'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.9'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '10.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '10.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '3'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '3'
|
69
|
+
description: Given a HTML formatted string, escapement will extract descendant tags
|
70
|
+
into a device agnostic attributes array that can be used for formatting the text
|
71
|
+
anywhere.
|
72
|
+
email:
|
73
|
+
- ryan@hodinkee.com
|
74
|
+
executables: []
|
75
|
+
extensions: []
|
76
|
+
extra_rdoc_files: []
|
77
|
+
files:
|
78
|
+
- ".gitignore"
|
79
|
+
- ".rspec"
|
80
|
+
- Gemfile
|
81
|
+
- README.md
|
82
|
+
- Rakefile
|
83
|
+
- bin/console
|
84
|
+
- bin/setup
|
85
|
+
- escapement.gemspec
|
86
|
+
- lib/escapement.rb
|
87
|
+
- lib/escapement/attributes.rb
|
88
|
+
- lib/escapement/block.rb
|
89
|
+
- lib/escapement/html.rb
|
90
|
+
- lib/escapement/tag.rb
|
91
|
+
- lib/escapement/traversal.rb
|
92
|
+
- lib/escapement/version.rb
|
93
|
+
homepage: https://github.com/hodinkee/escapement
|
94
|
+
licenses: []
|
95
|
+
metadata: {}
|
96
|
+
post_install_message:
|
97
|
+
rdoc_options: []
|
98
|
+
require_paths:
|
99
|
+
- lib
|
100
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - ">="
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: 2.0.0
|
105
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - ">="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
requirements: []
|
111
|
+
rubyforge_project:
|
112
|
+
rubygems_version: 2.4.6
|
113
|
+
signing_key:
|
114
|
+
specification_version: 4
|
115
|
+
summary: Extract child entities from an HTML string.
|
116
|
+
test_files: []
|