escapement 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7756a2e57d00b9d7842a0f09fb4396755144b620
4
+ data.tar.gz: e04508786e9c91d6778f4f1e97a4386c79025f33
5
+ SHA512:
6
+ metadata.gz: 65e3b2b84fb12da56c5f979764776b65aac28e992246b4ff601603fe51624d9623dc0287ffeb94f8993ecc531f2a2974bc41bbabf2f0538aa8e5cd9c1e753d2b
7
+ data.tar.gz: 744c837d2c0477c353fe604e0b192f31df63894ee2403b70a6e28323d7253699bc02613fb841e4700457dc02c4e2b0e11102957f163986181533c7542cfdaee3
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format progress
2
+ --color
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in escapement.gemspec
4
+ gemspec
data/README.md ADDED
@@ -0,0 +1,52 @@
1
+ # Escapement
2
+
3
+ Given an HTML-formatted string, escapement will extract descendant tags into a device-agnostic attributes array that can be used for formatting the text anywhere.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'escapement'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install escapement
20
+
21
+ ## Usage
22
+
23
+ Basic usage is very straightforward. Escapement will consider all root-level tags as separate paragraphs.
24
+
25
+ The position values are 0-based and are relative to the plain text result. The first value is the start of the attributed text, and the second is the end of the attributed text (exclusive, i.e. the start plus the length of the attributed text).
26
+
27
+ ``` ruby
28
+ body = "<p>Isn't <i>Tourbillon</i> a <a href=\"http://google.com\">great</a> word?</p>"
29
+
30
+ html = Escapement::HTML.new(body)
31
+ html.extract!
32
+ html.results
33
+ # => [{:text=>"Isn't Tourbillon a great word?", :entities=>[{:type=>"italic", :html_tag=>"i", :position=>[6, 16], :attributes=>{}}, {:type=>"link", :html_tag=>"a", :position=>[19, 24], :attributes=>{"href"=>"http://google.com"}}]}]
34
+ ```
35
+
36
+ ## How It Works
37
+
38
+ From a high level, Escapement uses [Nokogiri](https://github.com/sparklemotion/nokogiri) to recursively traverse the DOM tree. As it traverses, it keeps track of the current position of the node relative to the text content in order to determine entity position. There are no regular expression hacks involved.
39
+
40
+ ## Development
41
+
42
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
43
+
44
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release` to create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
45
+
46
+ ## Contributing
47
+
48
+ 1. Fork it ( https://github.com/[my-github-username]/escapement/fork )
49
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
50
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
51
+ 4. Push to the branch (`git push origin my-new-feature`)
52
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "escapement"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'escapement/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "escapement"
8
+ spec.version = Escapement::VERSION
9
+ spec.authors = ["Ryan LeFevre"]
10
+ spec.email = ["ryan@hodinkee.com"]
11
+
12
+ spec.summary = %q{Extract child entities from an HTML string.}
13
+ spec.description = %q{Given a HTML formatted string, escapement will extract descendant tags into a device agnostic attributes array that can be used for formatting the text anywhere.}
14
+ spec.homepage = "https://github.com/hodinkee/escapement"
15
+
16
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
17
+ spec.bindir = "exe"
18
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.required_ruby_version = ">= 2.0.0"
22
+
23
+ spec.add_dependency "nokogiri", "~> 1.6"
24
+
25
+ spec.add_development_dependency "bundler", "~> 1.9"
26
+ spec.add_development_dependency "rake", "~> 10.0"
27
+ spec.add_development_dependency "rspec", "~> 3"
28
+ end
@@ -0,0 +1,19 @@
1
module Escapement
  # Whitelist predicates that decide which HTML attributes are kept on an
  # entity, in order to cut down on the noise returned with the results.
  #
  # Each public method is named after an HTML tag and receives an attribute's
  # name and value; the attribute is kept when the method returns true.
  module Attributes
    extend self

    # Attribute names that are kept on <img> entities.
    IMG_ATTRIBUTES = %w[src width height].freeze

    # Fallback filter. By default no attributes are allowed; attributes are
    # whitelisted on a per-tag basis by the other methods in this module.
    #
    # @param key [String] the attribute name (unused)
    # @param value [Object] the attribute value (unused)
    # @return [Boolean] always false
    def default(key, value)
      false
    end

    # Filter for <a> tags: only the link target is kept.
    #
    # @param key [String] the attribute name
    # @param value [Object] the attribute value (unused)
    # @return [Boolean] true only for "href"
    def a(key, value)
      key == "href"
    end

    # Filter for <img> tags: keeps the source and the dimensions.
    #
    # @param key [String] the attribute name
    # @param value [Object] the attribute value (unused)
    # @return [Boolean] true for "src", "width" and "height"
    def img(key, value)
      IMG_ATTRIBUTES.include?(key)
    end
  end
end
@@ -0,0 +1,25 @@
1
module Escapement
  # One paragraph of the document, i.e. a root-level node of the given HTML
  # string. Every block carries its own plain-text value and its own array of
  # entities.
  class Block
    include Traversal

    attr_reader :node, :result

    # @param node [Object] the root-level node this block wraps
    def initialize(node)
      @node = node
      @result = nil
      # State read and updated by Traversal#process_children.
      @current_position = 0
      @entities = []
    end

    # Walks the node's children and stores the outcome in +result+.
    #
    # @return [Hash] the stored result, with the node's text under :text and
    #   the collected entities under :entities
    def process!
      process_children
      @result = { text: node.text, entities: @entities }
    end
  end
end
@@ -0,0 +1,20 @@
1
module Escapement
  # Wraps the whole document. After extraction it exposes one result per
  # paragraph/block, each holding that block's text value and entities.
  class HTML
    attr_reader :doc, :blocks, :results

    # @param html [String] the HTML string to parse
    def initialize(html)
      @doc = Nokogiri::HTML(html)
      @results = nil
      @blocks = []
    end

    # Builds and processes a Block for every child of the document body, then
    # collects each block's result.
    #
    # @return [Array] the per-block results, also available via +results+
    def extract!
      @blocks = doc.css('body').children.map do |child|
        block = Block.new(child)
        block.process!
        block
      end

      @results = @blocks.map { |block| block.result }
    end
  end
end
@@ -0,0 +1,51 @@
1
module Escapement
  # A tag represents an entity that may or may not have child elements.
  # Once the data about this DOM node has been recorded, the traversal
  # continues recursively through the node's children.
  class Tag
    include Traversal

    attr_reader :node, :entities

    # @param node [Object] the DOM node for this entity
    # @param start_position [Integer] offset of this node's text within the
    #   enclosing block's plain text
    def initialize(node, start_position)
      @node = node
      @start_position = @current_position = start_position
      @entities = []
    end

    # Records this node as an entity, then processes its children so that
    # their entities are appended after it.
    #
    # The recorded position is [start, start + text length], so the end
    # offset is exclusive.
    def process
      @entities << {
        type: node_to_type,
        html_tag: node.name,
        position: [@start_position, @start_position + node.text.length],
        attributes: Hash[filtered_attributes.map { |name, attr| [name, attr.value] }]
      }

      process_children
    end

    private

    # Maps the tag name to a device agnostic entity type. Unknown tags are
    # passed through under their own name.
    def node_to_type
      case node.name
      when 'p' then 'paragraph'
      when 'a' then 'link'
      when 'i', 'em' then 'italic'
      when 'u' then 'underline'
      when 'strong', 'b' then 'bold'
      when 'abbr' then 'abbreviation'
      when 'q' then 'quote'
      when 'pre' then 'preformatted'
      when 'img' then 'image'
      when 'li' then 'list_item'
      # Anchored so that only h1..h6 match, not any name that merely
      # contains "h" followed by a digit.
      when /\Ah[1-6]\z/ then 'header'
      else node.name
      end
    end

    # Returns the node's attributes that pass the whitelist for its tag.
    #
    # Only methods defined on Attributes itself are considered per-tag
    # filters. Using respond_to? here would also match methods every module
    # object has (name, hash, class, ...), so a tag with such a name would
    # pick a method with the wrong arity and raise.
    def filtered_attributes
      filter = Attributes.method_defined?(node.name) ? node.name : :default
      node.attributes.select(&Attributes.method(filter))
    end
  end
end
@@ -0,0 +1,24 @@
1
module Escapement
  # Shared child-walking logic. The including object must provide +node+ and
  # the instance variables @current_position (Integer) and @entities (Array).
  module Traversal
    # Processes all child nodes of the current node. As the recursion unwinds,
    # the entities found in each child are appended to @entities, so the
    # object at the root of the traversal ends up with the full result set.
    #
    # @current_position is advanced by the length of every child's content so
    # that it always points at the offset of the next child.
    def process_children
      node.children.each do |child|
        # Comments and processing instructions are not part of the parent's
        # text, so they must neither become entities nor move the position;
        # otherwise every following entity would be shifted.
        next if child.comment? || child.processing_instruction?

        unless child.text?
          # Not a text node, so it is an entity of some kind. Continue the
          # recursion, starting at the current offset.
          tag = Escapement::Tag.new(child, @current_position)
          tag.process
          @entities.concat(tag.entities)
        end

        # Text nodes carry no entities; in both cases we only need to step
        # over the child's content.
        @current_position += child.content.length
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Escapement
  # Current gem version. Frozen so the constant cannot be mutated in place.
  VERSION = "0.1.0".freeze
end
data/lib/escapement.rb ADDED
@@ -0,0 +1,7 @@
1
+ require "nokogiri"
2
+ require "escapement/traversal"
3
+ require "escapement/attributes"
4
+ require "escapement/block"
5
+ require "escapement/html"
6
+ require "escapement/tag"
7
+ require "escapement/version"
metadata ADDED
@@ -0,0 +1,116 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: escapement
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ryan LeFevre
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2015-07-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.9'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.9'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '10.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '3'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '3'
69
+ description: Given a HTML formatted string, escapement will extract descendant tags
70
+ into a device agnostic attributes array that can be used for formatting the text
71
+ anywhere.
72
+ email:
73
+ - ryan@hodinkee.com
74
+ executables: []
75
+ extensions: []
76
+ extra_rdoc_files: []
77
+ files:
78
+ - ".gitignore"
79
+ - ".rspec"
80
+ - Gemfile
81
+ - README.md
82
+ - Rakefile
83
+ - bin/console
84
+ - bin/setup
85
+ - escapement.gemspec
86
+ - lib/escapement.rb
87
+ - lib/escapement/attributes.rb
88
+ - lib/escapement/block.rb
89
+ - lib/escapement/html.rb
90
+ - lib/escapement/tag.rb
91
+ - lib/escapement/traversal.rb
92
+ - lib/escapement/version.rb
93
+ homepage: https://github.com/hodinkee/escapement
94
+ licenses: []
95
+ metadata: {}
96
+ post_install_message:
97
+ rdoc_options: []
98
+ require_paths:
99
+ - lib
100
+ required_ruby_version: !ruby/object:Gem::Requirement
101
+ requirements:
102
+ - - ">="
103
+ - !ruby/object:Gem::Version
104
+ version: 2.0.0
105
+ required_rubygems_version: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - ">="
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ requirements: []
111
+ rubyforge_project:
112
+ rubygems_version: 2.4.6
113
+ signing_key:
114
+ specification_version: 4
115
+ summary: Extract child entities from an HTML string.
116
+ test_files: []