html-hierarchy-extractor 1.0.2 → 1.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 90e26530c9a5d82ec576d31614a5107f0defa763
4
- data.tar.gz: ada6d22330888e48f4a5d274568e93552f964867
3
+ metadata.gz: 3f602c4f7bbce46be9791ed69ecbd2797dad12af
4
+ data.tar.gz: 925fab69f43102eb41a5ce2014be43c0145da96a
5
5
  SHA512:
6
- metadata.gz: 965435216e5844e62c248bff8b0e4aee5907094ba370ea642e4eedac0901e4b3f1ca3cf35cd2753d4c276a7032a2575ec2ae78786b7b645fd12c2a4ec26d6ddf
7
- data.tar.gz: e8a74ad80c0dac98abcb5a1e8ac07315389d3644bf0cf992af90e256bbc5feb2a10ac680681f07f9a7fc030c5e268248df425d40f20d2fc9d8a543030e0ae047
6
+ metadata.gz: c988218206a7a3461eddbf929c48dce69db4d1034c193106a93790b502c597f12eb60f620c523c67bbd720545c4f97dd4a74a6a118198569b7071b0e6d253a56
7
+ data.tar.gz: 958f6df05ed28f5b7bd5307eb930fa7ef362e2770ca4258594ad3d695edf9f4f19f097f792a1ab1d98bbef1350de16fd92e0ff2897b43016621e0b50f9659e8d
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html-hierarchy-extractor
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.2
4
+ version: 1.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tim Carry
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-07-20 00:00:00.000000000 Z
11
+ date: 2017-11-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -30,42 +30,42 @@ dependencies:
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '1.8'
33
+ version: '2.0'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '1.8'
40
+ version: '2.0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: nokogiri
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '1.6'
47
+ version: '1.8'
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '1.6'
54
+ version: '1.8'
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: coveralls
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: '0.8'
61
+ version: 0.8.21
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: '0.8'
68
+ version: 0.8.21
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: flay
71
71
  requirement: !ruby/object:Gem::Requirement
@@ -94,6 +94,34 @@ dependencies:
94
94
  - - "~>"
95
95
  - !ruby/object:Gem::Version
96
96
  version: '4.3'
97
+ - !ruby/object:Gem::Dependency
98
+ name: guard
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: '2.14'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: '2.14'
111
+ - !ruby/object:Gem::Dependency
112
+ name: guard-rake
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '1.0'
97
125
  - !ruby/object:Gem::Dependency
98
126
  name: guard-rspec
99
127
  requirement: !ruby/object:Gem::Requirement
@@ -142,68 +170,37 @@ dependencies:
142
170
  requirements:
143
171
  - - "~>"
144
172
  - !ruby/object:Gem::Version
145
- version: '0.31'
173
+ version: '0.51'
146
174
  type: :development
147
175
  prerelease: false
148
176
  version_requirements: !ruby/object:Gem::Requirement
149
177
  requirements:
150
178
  - - "~>"
151
179
  - !ruby/object:Gem::Version
152
- version: '0.31'
180
+ version: '0.51'
153
181
  - !ruby/object:Gem::Dependency
154
182
  name: simplecov
155
183
  requirement: !ruby/object:Gem::Requirement
156
184
  requirements:
157
185
  - - "~>"
158
186
  - !ruby/object:Gem::Version
159
- version: '0.10'
187
+ version: 0.14.1
160
188
  type: :development
161
189
  prerelease: false
162
190
  version_requirements: !ruby/object:Gem::Requirement
163
191
  requirements:
164
192
  - - "~>"
165
193
  - !ruby/object:Gem::Version
166
- version: '0.10'
194
+ version: 0.14.1
167
195
  description: Take any arbitrary HTML as input and extract its hierarchy as a list
168
- of items, including parents and contents.It is primarily intended to be used along
196
+ of items, including parents and contents. It is primarily intended to be used along
169
197
  with Algolia, to improve the relevance of searching into huge chunks of text
170
198
  email: tim@pixelastic.com
171
199
  executables: []
172
200
  extensions: []
173
- extra_rdoc_files:
174
- - LICENSE.txt
175
- - README.md
176
- files:
177
- - ".coveralls.yml"
178
- - ".document"
179
- - ".rspec"
180
- - ".rubocop.yml"
181
- - ".travis.yml"
182
- - CONTRIBUTING.md
183
- - Gemfile
184
- - Guardfile
185
- - LICENSE.txt
186
- - README.md
187
- - Rakefile
188
- - VERSION
189
- - html-hierarchy-extractor.gemspec
190
- - lib/html-hierarchy-extractor.rb
191
- - lib/version.rb
192
- - scripts/bump_version
193
- - scripts/check_flay
194
- - scripts/check_flog
195
- - scripts/coverage
196
- - scripts/git_hooks/pre-commit
197
- - scripts/git_hooks/pre-push
198
- - scripts/lint
199
- - scripts/release
200
- - scripts/test
201
- - scripts/test_ci
202
- - scripts/watch
203
- - spec/html_hierarchy_extractor_spec.rb
204
- - spec/spec_helper.rb
205
- - spec/spec_helper_simplecov.rb
206
- homepage: http://github.com/pixelastic/html-hierarchy-extractor
201
+ extra_rdoc_files: []
202
+ files: []
203
+ homepage: https://github.com/pixelastic/html-hierarchy-extractor
207
204
  licenses:
208
205
  - MIT
209
206
  metadata: {}
@@ -223,7 +220,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
223
220
  version: '0'
224
221
  requirements: []
225
222
  rubyforge_project:
226
- rubygems_version: 2.4.8
223
+ rubygems_version: 2.5.1
227
224
  signing_key:
228
225
  specification_version: 4
229
226
  summary: Extract HTML hierarchy (headings and content) into a list of items
@@ -1 +0,0 @@
1
- service_name: travis-ci
data/.document DELETED
@@ -1,5 +0,0 @@
1
- lib/**/*.rb
2
- bin/*
3
- -
4
- features/**/*.feature
5
- LICENSE.txt
data/.rspec DELETED
@@ -1,2 +0,0 @@
1
- --color
2
- --format progress
@@ -1,26 +0,0 @@
1
- # Defaults:
2
- # https://github.com/bbatsov/rubocop/blob/master/config/default.yml
3
- Metrics/AbcSize:
4
- Max: 100
5
-
6
- Metrics/ClassLength:
7
- Max: 200
8
-
9
- Metrics/ModuleLength:
10
- Max: 200
11
-
12
- Metrics/MethodLength:
13
- Max: 50
14
-
15
- Metrics/CyclomaticComplexity:
16
- Max: 10
17
-
18
- Metrics/PerceivedComplexity:
19
- Max: 10
20
-
21
- Style/FileName:
22
- Enabled: false
23
-
24
- Style/MultilineOperationIndentation:
25
- Enabled: false
26
-
@@ -1,12 +0,0 @@
1
- language: ruby
2
- cache: bundler
3
- before_script: bundle update
4
- script: ./scripts/test_ci
5
- rvm:
6
- - 2.2
7
- - 2.1
8
- - 2.0
9
- notifications:
10
- email:
11
- on_success: never
12
- on_failure: never
@@ -1,53 +0,0 @@
1
- Hi collaborator!
2
-
3
- If you have a fix or a new feature, please start by checking in the
4
- [issues](https://github.com/pixelastic/html-hierarchy-extractor/issues) if it is
5
- already referenced. If not, feel free to open one.
6
-
7
- We use [pull requests](https://github.com/pixelastic/html-hierarchy-extractor/pulls)
8
- for collaboration. The workflow is as follow:
9
-
10
- - Create a local branch, starting from `develop`
11
- - Submit the PR on `develop`
12
- - Wait for review
13
- - Do the changes requested (if any)
14
- - We may ask you to rebase the branch to latest `develop` if it gets out of sync
15
- - Get praise for your awesome contribution
16
-
17
- # Development workflow
18
-
19
- Run `bundle install` to get all dependencies up to date.
20
-
21
- You can then launch:
22
-
23
- - `./scripts/test` to launch tests
24
- - `./scripts/watch` to start a test watcher (for TDD) using Guard
25
-
26
- If you plan on submitting a PR, I suggest you install the git hooks. This will
27
- run pre-commit and pre-push checks. Those checks will also be run by TravisCI,
28
- but running them locally gives faster feedback.
29
-
30
- If you want to a local version of the gem in your local project, I suggest
31
- updating your project `Gemfile` to point to the correct local directory
32
-
33
- ```ruby
34
- gem "html-hierarchy-extractor", :path => "/path/to/local/gem/folder"
35
- ```
36
-
37
- You should also run `rake gemspec` from the `html-hierarchy-extractor`
38
- repository the first time and if you added/deleted any file or dependency.
39
-
40
- # Tagging and releasing
41
-
42
- This part is for main contributors:
43
-
44
- ```
45
- # Bump the version (in develop)
46
- ./scripts/bump_version minor
47
-
48
- # Update master and release
49
- ./scripts/release
50
-
51
- # Install the gem locally (optional)
52
- rake install
53
- ```
data/Gemfile DELETED
@@ -1,16 +0,0 @@
1
- source 'http://rubygems.org'
2
-
3
- gem 'awesome_print', '~> 1.6'
4
- gem 'json', '~> 1.8'
5
- gem 'nokogiri', '~> 1.6'
6
-
7
- group :development do
8
- gem 'coveralls', '~> 0.8'
9
- gem 'flay', '~> 2.6'
10
- gem 'flog', '~> 4.3'
11
- gem 'guard-rspec', '~> 4.6'
12
- gem 'jeweler', '~> 2.0'
13
- gem 'rspec', '~> 3.0'
14
- gem 'rubocop', '~> 0.31'
15
- gem 'simplecov', '~> 0.10'
16
- end
data/Guardfile DELETED
@@ -1,7 +0,0 @@
1
- guard :rspec, cmd: 'bundle exec rspec --color --format documentation' do
2
- watch(%r{^spec/.+_spec\.rb$})
3
- watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
4
- watch('spec/spec_helper.rb') { 'spec' }
5
- end
6
-
7
- notification :off
@@ -1,20 +0,0 @@
1
- Copyright (c) 2016 Pixelastic
2
-
3
- Permission is hereby granted, free of charge, to any person obtaining
4
- a copy of this software and associated documentation files (the
5
- "Software"), to deal in the Software without restriction, including
6
- without limitation the rights to use, copy, modify, merge, publish,
7
- distribute, sublicense, and/or sell copies of the Software, and to
8
- permit persons to whom the Software is furnished to do so, subject to
9
- the following conditions:
10
-
11
- The above copyright notice and this permission notice shall be
12
- included in all copies or substantial portions of the Software.
13
-
14
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md DELETED
@@ -1,141 +0,0 @@
1
- # html-hierarchy-extractor
2
-
3
- This gems lets you extract the hierarchy of headings and content from any HTML
4
- page into an array of elements.
5
-
6
- Intended to be used with [Algolia][1] to improve relevance of search
7
- results inside large HTML pages. The records created are compatible with the
8
- [DocSearch][2] format.
9
-
10
- ## Installation
11
-
12
- ```ruby
13
- # Gemfile
14
- source 'http://rubygems.org'
15
-
16
- gem 'html-hierarchy-extractor', '~> 1.0'
17
- ```
18
-
19
- ## How to use
20
-
21
- ```ruby
22
- require 'html-hierarchy-extractor'
23
-
24
- content = File.read('./index.html')
25
- page = HTMLHierarchyExtractor.new(content)
26
- records = page.extract
27
- puts records
28
- ```
29
-
30
- ## Records
31
-
32
- `extract` will return an array of records. Each record will represent a `<p>`
33
- paragraph of the initial text, along with it textual version (HTML removed),
34
- heading hierarchy, and other interesting bits.
35
-
36
- ## Example
37
-
38
- Let's take the following HTML as input and see what records we got as output:
39
-
40
- ```html
41
- <!doctype html>
42
- <html>
43
- <body>
44
- <h1 name="journey">The Hero's Journey</h1>
45
- <p>Most stories always follow the same pattern.</p>
46
- <h2 name="departure">Part One: Departure</h2>
47
- <p>A story starts in a mundane world, and helps identify the hero. It helps puts all the achievements of the story into perspective.</p>
48
- <h3 name="calladventure">The call to Adventure</h3>
49
- <p>Some out-of-the-ordinary event pushes the hero to start his journey.</p>
50
- <h3 name="threshold">Crossing the Threshold</h3>
51
- <p>The hero quits his job, hit the road, or whatever cuts him from his previous life.</p>
52
- <h2 name="initiations">Part Two: Initiation</h2>
53
- <h3 name="trials">The Road of Trials</h3>
54
- <p>The road is filled with dangers. The hero as to find his inner strength to overcome them.</p>
55
- <h3 name="ultimate">The Ultimate Boon</h3>
56
- <p>The hero has found something, either physical or metaphorical that changes him.</p>
57
- <h2 name="return">Part Three: Return</h2>
58
- <h3 name="refusal">Refusal to Return</h3>
59
- <p>The hero does not want to go back to his previous life at first. But then, an event will make him change his mind.</p>
60
- <h3 name="master">Master of Two Worlds</h3>
61
- <p>Armed with his new power/weapon, the hero can go back to its initial world and fix all the issues he had there.</p>
62
- </body>
63
- </html>
64
- ```
65
-
66
- Here is one of the records extracted:
67
-
68
- ```ruby
69
- {
70
- :uuid => "1f5923d5a60e998704f201bbe9964811",
71
- :tag_name => "p",
72
- :html => "<p>The hero quit his jobs, hit the road, or whatever cuts him from his previous life.</p>",
73
- :text => "The hero quit his jobs, hit the road, or whatever cuts him from his previous life.",
74
- :node => #<Nokogiri::XML::Element:0x11a5850 name="p">,
75
- :anchor => nil,
76
- :hierarchy => {
77
- :lvl0 => "The Hero's Journey",
78
- :lvl1 => "Part One: Departure",
79
- :lvl2 => "Crossing the Threshold",
80
- :lvl3 => nil,
81
- :lvl4 => nil,
82
- :lvl5 => nil,
83
- :lvl6 => nil
84
- },
85
- :weight => {
86
- :heading => 70,
87
- :position => 3
88
- }
89
- }
90
- ```
91
-
92
- Each record has a `uuid` that uniquely identify it (computed by a hash of all
93
- the other values).
94
-
95
- It also contains the HTML tag name in `tag_name` (by default `<p>`
96
- paragraphs are extracted, but see the [settings][3] on how to change it).
97
-
98
- `html` contains the whole `outerContent` of the element, including the wrapping
99
- tags and inner children. The `text` attribute contains the textual content,
100
- stripping out all HTML.
101
-
102
- `node` contains the [Nokogiri node][4] instance. The lib uses it internally to
103
- extract all the relevant information ut is also exposed if you want to process
104
- the node further.
105
-
106
- The `anchor` attributes contains the HTML anchor closest to the element. Here it
107
- is `threshold` because this is the closest anchor in the hierarchy above.
108
- Anchors are searched in `name` and `id` attributes of headings.
109
-
110
- `hierarchy` then contains a snapshot of the current heading hierarchy of the
111
- paragraph. The `lvlX` syntax is used to be compatible with the records
112
- [DocSearch][5] is using.
113
-
114
- The `weight` attribute is used to provide an easy way to rank two records
115
- relative to each other.
116
-
117
- - `heading` gives the depth level in the hierarchy where the record is. Records
118
- on top level will have a value of 100, those under a `h1` will have 90, and so
119
- on. Because our record is under a `h3`, it has 70.
120
- - `position` is the position of the paragraph in the page. Here our paragraph is
121
- the fourth paragraph of the page, so it will have a `position` of 3. It can
122
- help you give more weight to the first items in the page.
123
-
124
- ## Settings
125
-
126
- When instanciating `HTMLHierarchyExtractor`, you can pass a secondary `options`
127
- argument. This attribute accepts one value, `css_selector`.
128
-
129
- ```ruby
130
- page = HTMLHierarchyExtractor.new(content, { css_selector: 'p,li' })
131
- ```
132
-
133
- This lets you change the default selector. Here instead of `<p>` paragraph,
134
- the library will extract `<li>` list elements as well.
135
-
136
-
137
- [1]: https://www.algolia.com/
138
- [2]: https://community.algolia.com/docsearch/
139
- [3]: #Settings
140
- [4]: http://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/Node
141
- [5]: https://community.algolia.com/docsearch/