html-hierarchy-extractor 1.0.2 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 90e26530c9a5d82ec576d31614a5107f0defa763
4
- data.tar.gz: ada6d22330888e48f4a5d274568e93552f964867
3
+ metadata.gz: 3f602c4f7bbce46be9791ed69ecbd2797dad12af
4
+ data.tar.gz: 925fab69f43102eb41a5ce2014be43c0145da96a
5
5
  SHA512:
6
- metadata.gz: 965435216e5844e62c248bff8b0e4aee5907094ba370ea642e4eedac0901e4b3f1ca3cf35cd2753d4c276a7032a2575ec2ae78786b7b645fd12c2a4ec26d6ddf
7
- data.tar.gz: e8a74ad80c0dac98abcb5a1e8ac07315389d3644bf0cf992af90e256bbc5feb2a10ac680681f07f9a7fc030c5e268248df425d40f20d2fc9d8a543030e0ae047
6
+ metadata.gz: c988218206a7a3461eddbf929c48dce69db4d1034c193106a93790b502c597f12eb60f620c523c67bbd720545c4f97dd4a74a6a118198569b7071b0e6d253a56
7
+ data.tar.gz: 958f6df05ed28f5b7bd5307eb930fa7ef362e2770ca4258594ad3d695edf9f4f19f097f792a1ab1d98bbef1350de16fd92e0ff2897b43016621e0b50f9659e8d
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-hierarchy-extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Carry
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-07-20 00:00:00.000000000 Z
11
+ date: 2017-11-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -30,42 +30,42 @@ dependencies:
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '1.8'
33
+ version: '2.0'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '1.8'
40
+ version: '2.0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: nokogiri
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '1.6'
47
+ version: '1.8'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '1.6'
54
+ version: '1.8'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: coveralls
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '0.8'
61
+ version: 0.8.21
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '0.8'
68
+ version: 0.8.21
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: flay
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -94,6 +94,34 @@ dependencies:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
96
  version: '4.3'
97
+ - !ruby/object:Gem::Dependency
98
+ name: guard
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '2.14'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '2.14'
111
+ - !ruby/object:Gem::Dependency
112
+ name: guard-rake
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '1.0'
97
125
  - !ruby/object:Gem::Dependency
98
126
  name: guard-rspec
99
127
  requirement: !ruby/object:Gem::Requirement
@@ -142,68 +170,37 @@ dependencies:
142
170
  requirements:
143
171
  - - "~>"
144
172
  - !ruby/object:Gem::Version
145
- version: '0.31'
173
+ version: '0.51'
146
174
  type: :development
147
175
  prerelease: false
148
176
  version_requirements: !ruby/object:Gem::Requirement
149
177
  requirements:
150
178
  - - "~>"
151
179
  - !ruby/object:Gem::Version
152
- version: '0.31'
180
+ version: '0.51'
153
181
  - !ruby/object:Gem::Dependency
154
182
  name: simplecov
155
183
  requirement: !ruby/object:Gem::Requirement
156
184
  requirements:
157
185
  - - "~>"
158
186
  - !ruby/object:Gem::Version
159
- version: '0.10'
187
+ version: 0.14.1
160
188
  type: :development
161
189
  prerelease: false
162
190
  version_requirements: !ruby/object:Gem::Requirement
163
191
  requirements:
164
192
  - - "~>"
165
193
  - !ruby/object:Gem::Version
166
- version: '0.10'
194
+ version: 0.14.1
167
195
  description: Take any arbitrary HTML as input and extract its hierarchy as a list
168
- of items, including parents and contents.It is primarily intended to be used along
196
+ of items, including parents and contents. It is primarily intended to be used along
169
197
  with Algolia, to improve the relevance of searching into huge chunks of text
170
198
  email: tim@pixelastic.com
171
199
  executables: []
172
200
  extensions: []
173
- extra_rdoc_files:
174
- - LICENSE.txt
175
- - README.md
176
- files:
177
- - ".coveralls.yml"
178
- - ".document"
179
- - ".rspec"
180
- - ".rubocop.yml"
181
- - ".travis.yml"
182
- - CONTRIBUTING.md
183
- - Gemfile
184
- - Guardfile
185
- - LICENSE.txt
186
- - README.md
187
- - Rakefile
188
- - VERSION
189
- - html-hierarchy-extractor.gemspec
190
- - lib/html-hierarchy-extractor.rb
191
- - lib/version.rb
192
- - scripts/bump_version
193
- - scripts/check_flay
194
- - scripts/check_flog
195
- - scripts/coverage
196
- - scripts/git_hooks/pre-commit
197
- - scripts/git_hooks/pre-push
198
- - scripts/lint
199
- - scripts/release
200
- - scripts/test
201
- - scripts/test_ci
202
- - scripts/watch
203
- - spec/html_hierarchy_extractor_spec.rb
204
- - spec/spec_helper.rb
205
- - spec/spec_helper_simplecov.rb
206
- homepage: http://github.com/pixelastic/html-hierarchy-extractor
201
+ extra_rdoc_files: []
202
+ files: []
203
+ homepage: https://github.com/pixelastic/html-hierarchy-extractor
207
204
  licenses:
208
205
  - MIT
209
206
  metadata: {}
@@ -223,7 +220,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
223
220
  version: '0'
224
221
  requirements: []
225
222
  rubyforge_project:
226
- rubygems_version: 2.4.8
223
+ rubygems_version: 2.5.1
227
224
  signing_key:
228
225
  specification_version: 4
229
226
  summary: Extract HTML hierarchy (headings and content) into a list of items
@@ -1 +0,0 @@
1
- service_name: travis-ci
data/.document DELETED
@@ -1,5 +0,0 @@
1
- lib/**/*.rb
2
- bin/*
3
- -
4
- features/**/*.feature
5
- LICENSE.txt
data/.rspec DELETED
@@ -1,2 +0,0 @@
1
- --color
2
- --format progress
@@ -1,26 +0,0 @@
1
- # Defaults:
2
- # https://github.com/bbatsov/rubocop/blob/master/config/default.yml
3
- Metrics/AbcSize:
4
- Max: 100
5
-
6
- Metrics/ClassLength:
7
- Max: 200
8
-
9
- Metrics/ModuleLength:
10
- Max: 200
11
-
12
- Metrics/MethodLength:
13
- Max: 50
14
-
15
- Metrics/CyclomaticComplexity:
16
- Max: 10
17
-
18
- Metrics/PerceivedComplexity:
19
- Max: 10
20
-
21
- Style/FileName:
22
- Enabled: false
23
-
24
- Style/MultilineOperationIndentation:
25
- Enabled: false
26
-
@@ -1,12 +0,0 @@
1
- language: ruby
2
- cache: bundler
3
- before_script: bundle update
4
- script: ./scripts/test_ci
5
- rvm:
6
- - 2.2
7
- - 2.1
8
- - 2.0
9
- notifications:
10
- email:
11
- on_success: never
12
- on_failure: never
@@ -1,53 +0,0 @@
1
- Hi collaborator!
2
-
3
- If you have a fix or a new feature, please start by checking in the
4
- [issues](https://github.com/pixelastic/html-hierarchy-extractor/issues) if it is
5
- already referenced. If not, feel free to open one.
6
-
7
- We use [pull requests](https://github.com/pixelastic/html-hierarchy-extractor/pulls)
8
- for collaboration. The workflow is as follows:
9
-
10
- - Create a local branch, starting from `develop`
11
- - Submit the PR on `develop`
12
- - Wait for review
13
- - Do the changes requested (if any)
14
- - We may ask you to rebase the branch to latest `develop` if it gets out of sync
15
- - Get praise for your awesome contribution
16
-
17
- # Development workflow
18
-
19
- Run `bundle install` to get all dependencies up to date.
20
-
21
- You can then launch:
22
-
23
- - `./scripts/test` to launch tests
24
- - `./scripts/watch` to start a test watcher (for TDD) using Guard
25
-
26
- If you plan on submitting a PR, I suggest you install the git hooks. This will
27
- run pre-commit and pre-push checks. Those checks will also be run by TravisCI,
28
- but running them locally gives faster feedback.
29
-
30
- If you want to use a local version of the gem in your local project, I suggest
31
- updating your project `Gemfile` to point to the correct local directory
32
-
33
- ```ruby
34
- gem "html-hierarchy-extractor", :path => "/path/to/local/gem/folder"
35
- ```
36
-
37
- You should also run `rake gemspec` from the `html-hierarchy-extractor`
38
- repository the first time and if you added/deleted any file or dependency.
39
-
40
- # Tagging and releasing
41
-
42
- This part is for main contributors:
43
-
44
- ```
45
- # Bump the version (in develop)
46
- ./scripts/bump_version minor
47
-
48
- # Update master and release
49
- ./scripts/release
50
-
51
- # Install the gem locally (optional)
52
- rake install
53
- ```
data/Gemfile DELETED
@@ -1,16 +0,0 @@
1
- source 'http://rubygems.org'
2
-
3
- gem 'awesome_print', '~> 1.6'
4
- gem 'json', '~> 1.8'
5
- gem 'nokogiri', '~> 1.6'
6
-
7
- group :development do
8
- gem 'coveralls', '~> 0.8'
9
- gem 'flay', '~> 2.6'
10
- gem 'flog', '~> 4.3'
11
- gem 'guard-rspec', '~> 4.6'
12
- gem 'jeweler', '~> 2.0'
13
- gem 'rspec', '~> 3.0'
14
- gem 'rubocop', '~> 0.31'
15
- gem 'simplecov', '~> 0.10'
16
- end
data/Guardfile DELETED
@@ -1,7 +0,0 @@
1
- guard :rspec, cmd: 'bundle exec rspec --color --format documentation' do
2
- watch(%r{^spec/.+_spec\.rb$})
3
- watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
4
- watch('spec/spec_helper.rb') { 'spec' }
5
- end
6
-
7
- notification :off
@@ -1,20 +0,0 @@
1
- Copyright (c) 2016 Pixelastic
2
-
3
- Permission is hereby granted, free of charge, to any person obtaining
4
- a copy of this software and associated documentation files (the
5
- "Software"), to deal in the Software without restriction, including
6
- without limitation the rights to use, copy, modify, merge, publish,
7
- distribute, sublicense, and/or sell copies of the Software, and to
8
- permit persons to whom the Software is furnished to do so, subject to
9
- the following conditions:
10
-
11
- The above copyright notice and this permission notice shall be
12
- included in all copies or substantial portions of the Software.
13
-
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md DELETED
@@ -1,141 +0,0 @@
1
- # html-hierarchy-extractor
2
-
3
- This gem lets you extract the hierarchy of headings and content from any HTML
4
- page into an array of elements.
5
-
6
- Intended to be used with [Algolia][1] to improve relevance of search
7
- results inside large HTML pages. The records created are compatible with the
8
- [DocSearch][2] format.
9
-
10
- ## Installation
11
-
12
- ```ruby
13
- # Gemfile
14
- source 'http://rubygems.org'
15
-
16
- gem 'html-hierarchy-extractor', '~> 1.0'
17
- ```
18
-
19
- ## How to use
20
-
21
- ```ruby
22
- require 'html-hierarchy-extractor'
23
-
24
- content = File.read('./index.html')
25
- page = HTMLHierarchyExtractor.new(content)
26
- records = page.extract
27
- puts records
28
- ```
29
-
30
- ## Records
31
-
32
- `extract` will return an array of records. Each record will represent a `<p>`
33
- paragraph of the initial text, along with its textual version (HTML removed),
34
- heading hierarchy, and other interesting bits.
35
-
36
- ## Example
37
-
38
- Let's take the following HTML as input and see what records we got as output:
39
-
40
- ```html
41
- <!doctype html>
42
- <html>
43
- <body>
44
- <h1 name="journey">The Hero's Journey</h1>
45
- <p>Most stories always follow the same pattern.</p>
46
- <h2 name="departure">Part One: Departure</h2>
47
- <p>A story starts in a mundane world, and helps identify the hero. It helps puts all the achievements of the story into perspective.</p>
48
- <h3 name="calladventure">The call to Adventure</h3>
49
- <p>Some out-of-the-ordinary event pushes the hero to start his journey.</p>
50
- <h3 name="threshold">Crossing the Threshold</h3>
51
- <p>The hero quits his job, hit the road, or whatever cuts him from his previous life.</p>
52
- <h2 name="initiations">Part Two: Initiation</h2>
53
- <h3 name="trials">The Road of Trials</h3>
54
- <p>The road is filled with dangers. The hero as to find his inner strength to overcome them.</p>
55
- <h3 name="ultimate">The Ultimate Boon</h3>
56
- <p>The hero has found something, either physical or metaphorical that changes him.</p>
57
- <h2 name="return">Part Three: Return</h2>
58
- <h3 name="refusal">Refusal to Return</h3>
59
- <p>The hero does not want to go back to his previous life at first. But then, an event will make him change his mind.</p>
60
- <h3 name="master">Master of Two Worlds</h3>
61
- <p>Armed with his new power/weapon, the hero can go back to its initial world and fix all the issues he had there.</p>
62
- </body>
63
- </html>
64
- ```
65
-
66
- Here is one of the records extracted:
67
-
68
- ```ruby
69
- {
70
- :uuid => "1f5923d5a60e998704f201bbe9964811",
71
- :tag_name => "p",
72
- :html => "<p>The hero quit his jobs, hit the road, or whatever cuts him from his previous life.</p>",
73
- :text => "The hero quit his jobs, hit the road, or whatever cuts him from his previous life.",
74
- :node => #<Nokogiri::XML::Element:0x11a5850 name="p">,
75
- :anchor => "threshold",
76
- :hierarchy => {
77
- :lvl0 => "The Hero's Journey",
78
- :lvl1 => "Part One: Departure",
79
- :lvl2 => "Crossing the Threshold",
80
- :lvl3 => nil,
81
- :lvl4 => nil,
82
- :lvl5 => nil,
83
- :lvl6 => nil
84
- },
85
- :weight => {
86
- :heading => 70,
87
- :position => 3
88
- }
89
- }
90
- ```
91
-
92
- Each record has a `uuid` that uniquely identifies it (computed by a hash of all
93
- the other values).
94
-
95
- It also contains the HTML tag name in `tag_name` (by default `<p>`
96
- paragraphs are extracted, but see the [settings][3] on how to change it).
97
-
98
- `html` contains the whole `outerContent` of the element, including the wrapping
99
- tags and inner children. The `text` attribute contains the textual content,
100
- stripping out all HTML.
101
-
102
- `node` contains the [Nokogiri node][4] instance. The lib uses it internally to
103
- extract all the relevant information, but it is also exposed if you want to process
104
- the node further.
105
-
106
- The `anchor` attribute contains the HTML anchor closest to the element. Here it
107
- is `threshold` because this is the closest anchor in the hierarchy above.
108
- Anchors are searched in `name` and `id` attributes of headings.
109
-
110
- `hierarchy` then contains a snapshot of the current heading hierarchy of the
111
- paragraph. The `lvlX` syntax is used to be compatible with the records
112
- [DocSearch][5] is using.
113
-
114
- The `weight` attribute is used to provide an easy way to rank two records
115
- relative to each other.
116
-
117
- - `heading` gives the depth level in the hierarchy where the record is. Records
118
- on top level will have a value of 100, those under a `h1` will have 90, and so
119
- on. Because our record is under a `h3`, it has 70.
120
- - `position` is the position of the paragraph in the page. Here our paragraph is
121
- the fourth paragraph of the page, so it will have a `position` of 3. It can
122
- help you give more weight to the first items in the page.
123
-
124
- ## Settings
125
-
126
- When instantiating `HTMLHierarchyExtractor`, you can pass a secondary `options`
127
- argument. This attribute accepts one value, `css_selector`.
128
-
129
- ```ruby
130
- page = HTMLHierarchyExtractor.new(content, { css_selector: 'p,li' })
131
- ```
132
-
133
- This lets you change the default selector. Here, in addition to `<p>` paragraphs,
134
- the library will extract `<li>` list elements as well.
135
-
136
-
137
- [1]: https://www.algolia.com/
138
- [2]: https://community.algolia.com/docsearch/
139
- [3]: #Settings
140
- [4]: http://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/Node
141
- [5]: https://community.algolia.com/docsearch/