boilerpipe-ruby 0.4.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.circleci/config.yml +6 -24
- data/.dockerignore +7 -0
- data/CHANGELOG.md +13 -0
- data/Dockerfile +14 -0
- data/README.md +12 -3
- data/Rakefile +3 -4
- data/bin/console +3 -3
- data/boilerpipe-ruby.gemspec +6 -6
- data/boilerpipe_flow.md +40 -0
- data/lib/boilerpipe.rb +3 -0
- data/lib/boilerpipe/document/text_block.rb +10 -12
- data/lib/boilerpipe/document/text_document.rb +4 -3
- data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
- data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
- data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
- data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
- data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
- data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
- data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
- data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
- data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
- data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
- data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
- data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
- data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
- data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
- data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
- data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
- data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
- data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
- data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
- data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
- data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
- data/lib/boilerpipe/labels/label_action.rb +1 -1
- data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +1 -5
- data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
- data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
- data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
- data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
- data/lib/boilerpipe/version.rb +1 -1
- metadata +16 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 65756911038bd486a08337188c3275ebd0c0c65e4b902edbd6b6667dda422740
|
4
|
+
data.tar.gz: ff83632d9cea8e4a0ede8609115e8282a856d2d7728c801e64ecec39a9399857
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e5e902c81cea26252c41bc4b96d0faebe6682a0dc5ae2c09397762ef4c5a7f244c0c100f87923863b308d2ef9b5ecc732e674d2ca801e4087f99031d46776034
|
7
|
+
data.tar.gz: 6788183a0a4c9d01c764d17537c52edaf9d32b93fb42da8a013f9f0b14f6a4f757a8d3b5f77b73fef59f62fcf988a723abf7fe8305f5626b0337838c4eb31c7d
|
data/.circleci/config.yml
CHANGED
@@ -6,48 +6,30 @@ version: 2
|
|
6
6
|
jobs:
|
7
7
|
build:
|
8
8
|
docker:
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
- image: circleci/ruby:2.5.5-node-browsers
|
10
|
+
|
12
11
|
# Specify service dependencies here if necessary
|
13
12
|
# CircleCI maintains a library of pre-built images
|
14
13
|
# documented at https://circleci.com/docs/2.0/circleci-images/
|
15
|
-
# - image: circleci/postgres:9.4
|
16
14
|
|
17
15
|
working_directory: ~/repo
|
18
16
|
|
19
17
|
steps:
|
20
18
|
- checkout
|
21
19
|
|
22
|
-
|
23
|
-
- restore_cache:
|
24
|
-
keys:
|
25
|
-
- v1-dependencies-{{ checksum "Gemfile.lock" }}
|
26
|
-
# fallback to using the latest cache if no exact match is found
|
27
|
-
- v1-dependencies-
|
28
|
-
|
20
|
+
- run: gem install bundler
|
29
21
|
- run:
|
30
22
|
name: install dependencies
|
31
23
|
command: |
|
32
|
-
bundle install --jobs=4 --retry=3
|
24
|
+
bundle install --jobs=4 --retry=3
|
33
25
|
|
34
|
-
- save_cache:
|
35
|
-
paths:
|
36
|
-
- ./vendor/bundle
|
37
|
-
key: v1-dependencies-{{ checksum "Gemfile.lock" }}
|
38
|
-
|
39
|
-
# Database setup
|
40
|
-
#- run: bundle exec rake db:create
|
41
|
-
#- run: bundle exec rake db:schema:load
|
42
|
-
|
43
|
-
# run tests!
|
44
26
|
- run:
|
45
27
|
name: run tests
|
46
28
|
command: |
|
47
29
|
mkdir /tmp/test-results
|
48
30
|
TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb")"
|
49
|
-
|
50
|
-
rspec --format progress "spec"
|
31
|
+
|
32
|
+
bundle exec rspec --format progress "spec"
|
51
33
|
|
52
34
|
# collect reports
|
53
35
|
- store_test_results:
|
data/.dockerignore
ADDED
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,16 @@
|
|
1
|
+
# 0.4.1 / 2019-07-04
|
2
|
+
|
3
|
+
* Fix bug in min_clause_words_filter ( used in article_sentence_extractor )
|
4
|
+
* Allow tests to run in Docker
|
5
|
+
* Update circle to continue to work
|
6
|
+
* Add architecture flow
|
7
|
+
* Code formatting
|
8
|
+
* Add min words filter specs
|
9
|
+
* Add label action specs
|
10
|
+
* Add missing test case to ignorable element spec
|
11
|
+
* Add merge_next case to text block spec
|
12
|
+
* Dry up includes
|
13
|
+
|
1
14
|
# 0.4.0 / 2017-09-15
|
2
15
|
|
3
16
|
* Add KeepEverythingWithMinKWords Extractor
|
data/Dockerfile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
From ruby:2.5
|
2
|
+
RUN gem install bundler
|
3
|
+
COPY *gemspec /usr/src/app/
|
4
|
+
COPY Gemfile /usr/src/app/
|
5
|
+
COPY lib/boilerpipe/version.rb /usr/src/app/lib/boilerpipe/
|
6
|
+
COPY bin /usr/src/app/
|
7
|
+
COPY bin/* /usr/src/app/bin/
|
8
|
+
|
9
|
+
WORKDIR /usr/src/app
|
10
|
+
RUN bin/setup
|
11
|
+
|
12
|
+
COPY . /usr/src/app/
|
13
|
+
|
14
|
+
CMD ["bundle", "exec", "rspec", "--color", "--format", "doc"]
|
data/README.md
CHANGED
@@ -1,5 +1,8 @@
|
|
1
1
|
# Boilerpipe
|
2
2
|
|
3
|
+
[![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
|
4
|
+
[![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
|
5
|
+
|
3
6
|
A pure ruby implementation of the boilerpipe algorithm.
|
4
7
|
|
5
8
|
This is a text extraction utility first written by Christian Kohlschütter - [presentation](http://videolectures.net/wsdm2010_kohlschutter_bdu/)
|
@@ -10,6 +13,8 @@ I saw other gems making use of boilerpipe via the [free api](http://boilerpipe-w
|
|
10
13
|
|
11
14
|
This solution works great if you're using Jruby but I wanted a pure ruby solution to use on MRI. Open vim - start coding...
|
12
15
|
|
16
|
+
Here's a high level [diagram](boilerpipe_flow.md) of how the system works.
|
17
|
+
|
13
18
|
# TLDR
|
14
19
|
|
15
20
|
Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor - try out the others when you feel like experimenting...
|
@@ -24,9 +29,6 @@ Presently the follow Extractors are implemented
|
|
24
29
|
* [x] LargestContentExtractor
|
25
30
|
* [x] NumWordsRulesExtractor
|
26
31
|
|
27
|
-
[![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
|
28
|
-
|
29
|
-
[![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
|
30
32
|
|
31
33
|
## Installation
|
32
34
|
|
@@ -71,6 +73,13 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
|
71
73
|
|
72
74
|
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
73
75
|
|
76
|
+
### Running Tests on Docker
|
77
|
+
|
78
|
+
The default run command will run the tests
|
79
|
+
|
80
|
+
docker build -t boilerpipe .
|
81
|
+
docker run -it --rm boilerpipe
|
82
|
+
|
74
83
|
## Contributing
|
75
84
|
|
76
85
|
Bug reports and pull requests are welcome on GitHub at https://github.com/gregors/boilerpipe-ruby.
|
data/Rakefile
CHANGED
@@ -1,14 +1,13 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require 'bundler/gem_tasks'
|
2
|
+
require 'rspec/core/rake_task'
|
3
3
|
|
4
4
|
RSpec::Core::RakeTask.new(:spec)
|
5
5
|
|
6
6
|
task :default => :spec
|
7
7
|
|
8
|
-
|
9
8
|
desc 'Downloads forked boilerpipe jar from Gregors github for sanity checks'
|
10
9
|
task :download_boilerpipe_jar do
|
11
10
|
FileUtils.mkdir_p 'spec/sanity_checks/jars/'
|
12
11
|
Dir.chdir 'spec/sanity_checks/jars/'
|
13
|
-
|
12
|
+
`wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
|
14
13
|
end
|
data/bin/console
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require
|
4
|
-
require
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'boilerpipe'
|
5
5
|
|
6
6
|
# You can add fixtures and/or initialization code here to make experimenting
|
7
7
|
# with your gem easier. You can also use a different console, if you like.
|
@@ -10,5 +10,5 @@ require "boilerpipe"
|
|
10
10
|
# require "pry"
|
11
11
|
# Pry.start
|
12
12
|
|
13
|
-
require
|
13
|
+
require 'irb'
|
14
14
|
IRB.start
|
data/boilerpipe-ruby.gemspec
CHANGED
@@ -10,18 +10,18 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ['<gregory.ostermayr@gmail.com>']
|
11
11
|
spec.license = 'Apache 2.0'
|
12
12
|
|
13
|
-
spec.summary = %q{A pure ruby
|
14
|
-
spec.description = %q{A pure ruby implementation of the boilerpipe algorithm}
|
13
|
+
spec.summary = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
|
14
|
+
spec.description = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
|
15
15
|
spec.homepage = 'https://github.com/gregors/boilerpipe-ruby'
|
16
16
|
|
17
17
|
spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
18
|
-
spec.bindir =
|
18
|
+
spec.bindir = 'exe'
|
19
19
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
20
|
-
spec.require_paths = [
|
20
|
+
spec.require_paths = ['lib']
|
21
21
|
|
22
|
-
spec.add_development_dependency 'bundler', '~>
|
22
|
+
spec.add_development_dependency 'bundler', '~> 2.0'
|
23
23
|
spec.add_development_dependency 'rake', '~> 10.0'
|
24
|
-
spec.add_development_dependency 'rspec', '~> 3.0'
|
25
24
|
spec.add_development_dependency 'rickshaw', '~> 0.4.0'
|
25
|
+
spec.add_development_dependency 'rspec', '~> 3.0'
|
26
26
|
spec.add_runtime_dependency 'nokogiri', '>= 1.6.6.2'
|
27
27
|
end
|
data/boilerpipe_flow.md
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
```
|
2
|
+
raw html
|
3
|
+
|
|
4
|
+
|
|
5
|
+
sax input -> sax parser(html parser) -> HTML Content handler -> tokenizer ---------
|
6
|
+
|
|
7
|
+
-------------------------------------<------------------------------------<------|
|
8
|
+
| | |
|
9
|
+
text blocks text blocks text blocks
|
10
|
+
| | |
|
11
|
+
| | |
|
12
|
+
-----------------------------
|
13
|
+
|
|
14
|
+
|
|
15
|
+
text document
|
16
|
+
|
|
17
|
+
|
|
18
|
+
filter
|
19
|
+
|
|
20
|
+
filter
|
21
|
+
|
|
22
|
+
filter
|
23
|
+
|
|
24
|
+
filter
|
25
|
+
|
|
26
|
+
filter
|
27
|
+
|
|
28
|
+
filter
|
29
|
+
|
|
30
|
+
filter
|
31
|
+
|
|
32
|
+
filter
|
33
|
+
|
|
34
|
+
filter
|
35
|
+
|
|
36
|
+
|
|
37
|
+
text document
|
38
|
+
|
|
39
|
+
outputs extracted text
|
40
|
+
```
|
data/lib/boilerpipe.rb
CHANGED
@@ -1,10 +1,7 @@
|
|
1
|
-
require 'set'
|
2
|
-
|
3
1
|
module Boilerpipe
|
4
2
|
module Document
|
5
3
|
class TextBlock
|
6
|
-
|
7
|
-
#EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
|
4
|
+
# EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
|
8
5
|
|
9
6
|
attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
|
10
7
|
:num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
|
@@ -12,7 +9,7 @@ module Boilerpipe
|
|
12
9
|
|
13
10
|
attr_accessor :content
|
14
11
|
|
15
|
-
def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
|
12
|
+
def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0)
|
16
13
|
@labels = Set.new
|
17
14
|
@text = text
|
18
15
|
@num_words = num_words
|
@@ -32,9 +29,9 @@ module Boilerpipe
|
|
32
29
|
new('', 0, 0, 0, 0, -1)
|
33
30
|
end
|
34
31
|
|
35
|
-
|
36
|
-
|
37
|
-
|
32
|
+
def set_tag_level(level)
|
33
|
+
@tag_level = level
|
34
|
+
end
|
38
35
|
|
39
36
|
def is_content?
|
40
37
|
@content
|
@@ -68,8 +65,8 @@ module Boilerpipe
|
|
68
65
|
@num_words_in_anchor_text += other.num_words_in_anchor_text
|
69
66
|
@num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
|
70
67
|
@num_wrapped_lines += other.num_wrapped_lines
|
71
|
-
@offset_blocks_start = [@offset_blocks_start
|
72
|
-
@offset_blocks_end = [@offset_blocks_end
|
68
|
+
@offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
|
69
|
+
@offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
|
73
70
|
init_densities
|
74
71
|
@content |= other.is_content?
|
75
72
|
|
@@ -87,10 +84,10 @@ module Boilerpipe
|
|
87
84
|
end
|
88
85
|
|
89
86
|
def to_s
|
90
|
-
#"[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
|
87
|
+
# "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
|
91
88
|
labels = 'null'
|
92
89
|
if !@labels.empty?
|
93
|
-
labels ="[#{
|
90
|
+
labels = "[#{@labels.to_a.join(',')}]"
|
94
91
|
end
|
95
92
|
"[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}"
|
96
93
|
end
|
@@ -100,6 +97,7 @@ module Boilerpipe
|
|
100
97
|
end
|
101
98
|
|
102
99
|
private
|
100
|
+
|
103
101
|
def init_densities
|
104
102
|
if @num_words_in_wrapped_lines == 0
|
105
103
|
@num_words_in_wrapped_lines = @num_words
|
@@ -19,12 +19,14 @@ module Boilerpipe
|
|
19
19
|
case text_block.is_content?
|
20
20
|
when true
|
21
21
|
next unless include_content
|
22
|
+
|
22
23
|
s << text_block.text
|
23
24
|
s << "\n"
|
24
25
|
when false
|
25
26
|
next unless include_noncontent
|
26
|
-
|
27
|
-
|
27
|
+
|
28
|
+
s << text_block.text
|
29
|
+
s << "\n"
|
28
30
|
end
|
29
31
|
end
|
30
32
|
s
|
@@ -38,7 +40,6 @@ module Boilerpipe
|
|
38
40
|
@text_blocks.map(&:to_s).join("\n")
|
39
41
|
end
|
40
42
|
alias_method :debug_string, :debug_s
|
41
|
-
|
42
43
|
end
|
43
44
|
end
|
44
45
|
end
|
@@ -1,11 +1,8 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
# probably makes sense only in cases where an upstream filter already has removed some blocks.
|
1
|
+
# Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
|
2
|
+
# probably makes sense only in cases where an upstream filter already has removed some blocks.
|
4
3
|
|
5
4
|
module Boilerpipe::Filters
|
6
5
|
class BlockProximityFusion
|
7
|
-
|
8
|
-
|
9
6
|
def initialize(max_blocks_distance, content_only, same_tag_level_only)
|
10
7
|
@max_blocks_distance = max_blocks_distance
|
11
8
|
@content_only = content_only
|
@@ -13,8 +10,8 @@ module Boilerpipe::Filters
|
|
13
10
|
end
|
14
11
|
|
15
12
|
MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
|
16
|
-
MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(
|
17
|
-
MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(
|
13
|
+
MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
|
14
|
+
MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
|
18
15
|
MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)
|
19
16
|
|
20
17
|
def process(doc)
|
@@ -22,7 +19,7 @@ module Boilerpipe::Filters
|
|
22
19
|
return false if text_blocks.size < 2
|
23
20
|
|
24
21
|
prev_block = if @content_only
|
25
|
-
text_blocks.find{ |tb| tb.is_content? }
|
22
|
+
text_blocks.find { |tb| tb.is_content? }
|
26
23
|
else
|
27
24
|
text_blocks.first
|
28
25
|
end
|
@@ -46,18 +43,16 @@ module Boilerpipe::Filters
|
|
46
43
|
ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
|
47
44
|
ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only
|
48
45
|
|
49
|
-
if
|
46
|
+
if ok
|
50
47
|
prev_block.merge_next(tb)
|
51
48
|
blocks_to_remove << tb
|
52
49
|
else
|
53
50
|
prev_block = tb
|
54
51
|
end
|
55
52
|
end
|
56
|
-
|
57
53
|
end
|
58
|
-
doc.replace_text_blocks!(
|
54
|
+
doc.replace_text_blocks!(text_blocks - blocks_to_remove)
|
59
55
|
doc
|
60
56
|
end
|
61
|
-
|
62
57
|
end
|
63
58
|
end
|
@@ -1,9 +1,7 @@
|
|
1
|
-
|
2
|
-
# Removes TextBlocks which have explicitly been marked as "not content".
|
1
|
+
# Removes TextBlocks which have explicitly been marked as "not content".
|
3
2
|
|
4
3
|
module Boilerpipe::Filters
|
5
4
|
class BoilerplateBlockFilter
|
6
|
-
|
7
5
|
def initialize(label)
|
8
6
|
@label_to_keep = label
|
9
7
|
end
|
@@ -21,6 +19,5 @@ module Boilerpipe::Filters
|
|
21
19
|
doc.replace_text_blocks!(combined)
|
22
20
|
doc
|
23
21
|
end
|
24
|
-
|
25
22
|
end
|
26
23
|
end
|
@@ -1,10 +1,9 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
1
|
+
# A full-text extractor trained on http://krdwrd.org/
|
2
|
+
# https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
|
3
|
+
# Works well with SimpleEstimator, too.
|
4
4
|
|
5
5
|
module Boilerpipe::Filters
|
6
6
|
class CanolaClassifier
|
7
|
-
|
8
7
|
def self.process(doc)
|
9
8
|
return doc if doc.text_blocks.size < 1
|
10
9
|
|
@@ -22,7 +21,7 @@ module Boilerpipe::Filters
|
|
22
21
|
def self.classify(prev, current, nxt)
|
23
22
|
current.link_density > 0 && nxt.num_words > 11 \
|
24
23
|
|| current.num_words > 19 \
|
25
|
-
|| nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (
|
24
|
+
|| nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19)
|
26
25
|
end
|
27
26
|
end
|
28
27
|
end
|
@@ -5,9 +5,8 @@
|
|
5
5
|
|
6
6
|
module Boilerpipe::Filters
|
7
7
|
class DensityRulesClassifier
|
8
|
-
|
9
8
|
def self.process(doc)
|
10
|
-
#return doc if doc.text_blocks.size < 2
|
9
|
+
# return doc if doc.text_blocks.size < 2
|
11
10
|
|
12
11
|
empty = Boilerpipe::Document::TextBlock.empty_start
|
13
12
|
text_blocks = [empty] + doc.text_blocks + [empty]
|
@@ -26,12 +25,14 @@ module Boilerpipe::Filters
|
|
26
25
|
if prev.link_density <= 0.555556
|
27
26
|
if current.text_density <= 9
|
28
27
|
return true if nxt.text_density > 10
|
28
|
+
|
29
29
|
return prev.text_density <= 4 ? false : true
|
30
30
|
else
|
31
31
|
return nxt.text_density == 0 ? false : true
|
32
32
|
end
|
33
33
|
else
|
34
34
|
return false if nxt.text_density <= 11
|
35
|
+
|
35
36
|
true
|
36
37
|
end
|
37
38
|
end
|
@@ -1,12 +1,9 @@
|
|
1
|
-
#
|
2
|
-
|
1
|
+
# Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
|
2
|
+
# some heuristics which are quite specific to the news domain.
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
# we create a list of potential titles from the page title
|
8
|
-
# then we look at every text block and if the text block
|
9
|
-
# contains a potential title - we set that text block label as :TITLE
|
4
|
+
# we create a list of potential titles from the page title
|
5
|
+
# then we look at every text block and if the text block
|
6
|
+
# contains a potential title - we set that text block label as :TITLE
|
10
7
|
|
11
8
|
module Boilerpipe::Filters
|
12
9
|
class DocumentTitleMatchClassifier
|
@@ -55,24 +52,24 @@ module Boilerpipe::Filters
|
|
55
52
|
@potential_titles << title
|
56
53
|
|
57
54
|
# unnecessary
|
58
|
-
#p = longest_part(title, /[ ]*[|»-][ ]*/)
|
59
|
-
|
55
|
+
# p = longest_part(title, /[ ]*[|»-][ ]*/)
|
56
|
+
# @potential_titles << p if p
|
60
57
|
|
61
|
-
#p = longest_part(title, /[ ]*[|»:][ ]*/)
|
62
|
-
|
58
|
+
# p = longest_part(title, /[ ]*[|»:][ ]*/)
|
59
|
+
# @potential_titles << p if p
|
63
60
|
|
64
|
-
#p = longest_part(title, /[ ]*[|»:()][ ]*/)
|
65
|
-
|
61
|
+
# p = longest_part(title, /[ ]*[|»:()][ ]*/)
|
62
|
+
# @potential_titles << p if p
|
66
63
|
|
67
|
-
#p = longest_part(title, /[ ]*[|»:()-][ ]*/)
|
68
|
-
|
64
|
+
# p = longest_part(title, /[ ]*[|»:()-][ ]*/)
|
65
|
+
# @potential_titles << p if p
|
69
66
|
|
70
67
|
p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
|
71
68
|
@potential_titles << p if p
|
72
69
|
|
73
70
|
# we replace \u00a0 so why check for it?
|
74
|
-
#p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
|
75
|
-
|
71
|
+
# p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
|
72
|
+
# @potential_titles << p if p
|
76
73
|
|
77
74
|
add_potential_titles(title, /[ ]+[|][ ]+/, 4)
|
78
75
|
add_potential_titles(title, /[ ]+[-][ ]+/, 4)
|
@@ -90,6 +87,7 @@ module Boilerpipe::Filters
|
|
90
87
|
|
91
88
|
parts.each do |part|
|
92
89
|
next if part =~ /[.]com/
|
90
|
+
|
93
91
|
num_words = number_of_words(part)
|
94
92
|
|
95
93
|
if num_words > longest_num_words || part.size > longest_part.size
|
@@ -107,6 +105,7 @@ module Boilerpipe::Filters
|
|
107
105
|
|
108
106
|
parts.each do |part|
|
109
107
|
next if part =~ /[.]com/
|
108
|
+
|
110
109
|
num_words = number_of_words(part)
|
111
110
|
|
112
111
|
@potential_titles << part if num_words >= min_words
|
@@ -116,6 +115,5 @@ module Boilerpipe::Filters
|
|
116
115
|
def number_of_words(s)
|
117
116
|
s.split(/[\b ]+/).size
|
118
117
|
end
|
119
|
-
|
120
118
|
end
|
121
119
|
end
|
@@ -1,10 +1,8 @@
|
|
1
|
-
|
2
1
|
# Marks all TextBlocks "content" which are between the headline and the part that has
|
3
2
|
# already been marked content, if they are marked MIGHT_BE_CONTENT.
|
4
3
|
# This filter is quite specific to the news domain.
|
5
4
|
# used downstream of KeepLargestBlockFilter since that's what sets MIGHT_BE_CONTENT
|
6
5
|
|
7
|
-
|
8
6
|
module Boilerpipe::Filters
|
9
7
|
class ExpandTitleToContentFilter
|
10
8
|
def self.process(doc)
|
@@ -38,6 +36,5 @@ module Boilerpipe::Filters
|
|
38
36
|
def self.no_title_with_subsequent_content?(content_start, title)
|
39
37
|
title.nil? || content_start.nil? || content_start <= title
|
40
38
|
end
|
41
|
-
|
42
39
|
end
|
43
40
|
end
|
@@ -1,12 +1,11 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
1
|
+
# Marks all blocks as "non-content" that occur after blocks that have been
|
2
|
+
# marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
|
3
|
+
# number of words in content blocks occur before this mark (default: 60).
|
4
|
+
# This can be used in conjunction with an upstream TerminatingBlocksFinder.
|
5
5
|
|
6
6
|
module Boilerpipe::Filters
|
7
7
|
class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
|
8
|
-
|
9
|
-
def self.process(doc, min_num_words=60)
|
8
|
+
def self.process(doc, min_num_words = 60)
|
10
9
|
found_end_of_text = false
|
11
10
|
num_words = 0
|
12
11
|
|
@@ -19,6 +18,5 @@ module Boilerpipe::Filters
|
|
19
18
|
|
20
19
|
doc
|
21
20
|
end
|
22
|
-
|
23
21
|
end
|
24
22
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Keeps the largest TextBlock only (by the number of words). In case of
|
3
2
|
# more than one block with the same number of words, the first block is chosen.
|
4
3
|
# All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
|
@@ -8,7 +7,6 @@
|
|
8
7
|
|
9
8
|
module Boilerpipe::Filters
|
10
9
|
class KeepLargestBlockFilter
|
11
|
-
|
12
10
|
def initialize(expand_to_same_level_text, min_words)
|
13
11
|
@expand_to_same_level_text = expand_to_same_level_text
|
14
12
|
@min_words = min_words
|
@@ -43,7 +41,7 @@ module Boilerpipe::Filters
|
|
43
41
|
expand_tag_level(tbs[0...n].reverse, level, @min_words)
|
44
42
|
|
45
43
|
# expand blocks to the right
|
46
|
-
expand_tag_level(tbs[n+1..-1], level, @min_words)
|
44
|
+
expand_tag_level(tbs[n + 1..-1], level, @min_words)
|
47
45
|
end
|
48
46
|
end
|
49
47
|
|
@@ -57,6 +55,5 @@ module Boilerpipe::Filters
|
|
57
55
|
end
|
58
56
|
end
|
59
57
|
end
|
60
|
-
|
61
58
|
end
|
62
59
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Marks all blocks as content that:
|
3
2
|
# are on the same tag-level as very likely main content
|
4
3
|
# (usually the level of the largest block)
|
@@ -7,23 +6,22 @@
|
|
7
6
|
|
8
7
|
module Boilerpipe::Filters
|
9
8
|
class LargeBlockSameTagLevelToContentFilter
|
10
|
-
|
11
9
|
def self.process(doc)
|
12
|
-
|
13
10
|
largest = doc.text_blocks.find do |tb|
|
14
11
|
tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
|
15
12
|
end
|
16
13
|
|
17
14
|
return doc if largest.nil?
|
15
|
+
|
18
16
|
tag_level = largest.tag_level
|
19
17
|
|
20
18
|
doc.text_blocks.each do |tb|
|
21
19
|
next if tb.is_content?
|
20
|
+
|
22
21
|
tb.content = true if tb.num_words >= 100 && tb.tag_level == tag_level
|
23
22
|
end
|
24
23
|
|
25
24
|
doc
|
26
25
|
end
|
27
|
-
|
28
26
|
end
|
29
27
|
end
|
@@ -11,7 +11,7 @@ module Boilerpipe::Filters
|
|
11
11
|
doc.text_blocks.each do |tb|
|
12
12
|
if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
|
13
13
|
tag_level = tb.tag_level
|
14
|
-
elsif
|
14
|
+
elsif tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0
|
15
15
|
tb.content = true
|
16
16
|
else
|
17
17
|
tag_level = MAX
|
@@ -20,6 +20,5 @@ module Boilerpipe::Filters
|
|
20
20
|
|
21
21
|
doc
|
22
22
|
end
|
23
|
-
|
24
23
|
end
|
25
24
|
end
|
@@ -1,14 +1,12 @@
|
|
1
|
-
|
1
|
+
# Marks all blocks as content.
|
2
2
|
|
3
3
|
module Boilerpipe::Filters
|
4
4
|
class MarkEverythingContentFilter
|
5
|
-
|
6
5
|
def self.process(doc)
|
7
6
|
doc.text_blocks.each do |tb|
|
8
7
|
tb.content = true if tb.is_not_content?
|
9
8
|
end
|
10
9
|
doc
|
11
10
|
end
|
12
|
-
|
13
11
|
end
|
14
12
|
end
|
@@ -8,30 +8,27 @@
|
|
8
8
|
|
9
9
|
module Boilerpipe::Filters
|
10
10
|
class MinClauseWordsFilter
|
11
|
-
|
12
|
-
def self.process(doc, min_words=5)
|
13
|
-
|
11
|
+
def self.process(doc, min_words = 5)
|
14
12
|
doc.text_blocks.each do |tb|
|
15
13
|
next if tb.is_not_content?
|
16
14
|
|
17
15
|
clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
|
16
|
+
hasClause = false
|
18
17
|
tb.text.scan(clause_delimiter).each do |possible_clause|
|
19
|
-
|
20
|
-
break
|
21
|
-
else
|
22
|
-
tb.content = false
|
23
|
-
end
|
18
|
+
hasClause |= is_clause? possible_clause
|
24
19
|
end
|
20
|
+
|
21
|
+
tb.content = false unless hasClause
|
25
22
|
end
|
26
23
|
|
27
24
|
doc
|
28
25
|
end
|
29
26
|
|
30
|
-
def self.is_clause?(text, min_words=5)
|
31
|
-
|
27
|
+
def self.is_clause?(text, min_words = 5)
|
28
|
+
return false if text.nil?
|
29
|
+
|
32
30
|
whitespace = /[ \n\r]+/
|
33
31
|
text.scan(whitespace).size >= min_words
|
34
32
|
end
|
35
|
-
|
36
33
|
end
|
37
34
|
end
|
@@ -1,16 +1,14 @@
|
|
1
|
-
|
2
1
|
# Keeps only those content blocks which contain at least k words.
|
3
2
|
|
4
3
|
module Boilerpipe::Filters
|
5
4
|
class MinWordsFilter
|
6
|
-
|
7
5
|
def self.process(min_words, doc)
|
8
6
|
doc.text_blocks.each do |tb|
|
9
7
|
next if tb.is_not_content?
|
8
|
+
|
10
9
|
tb.content = false if tb.num_words < min_words
|
11
10
|
end
|
12
11
|
doc
|
13
12
|
end
|
14
|
-
|
15
13
|
end
|
16
14
|
end
|
@@ -1,5 +1,3 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
# Classifies TextBlocks as content/not-content through rules that have been determined
|
4
2
|
# using the C4.8 machine learning algorithm, as described in the paper
|
5
3
|
# "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
|
@@ -7,7 +5,6 @@
|
|
7
5
|
|
8
6
|
module Boilerpipe::Filters
|
9
7
|
class NumWordsRulesClassifier
|
10
|
-
|
11
8
|
def self.process(doc)
|
12
9
|
empty = Boilerpipe::Document::TextBlock.empty_start
|
13
10
|
text_blocks = [empty] + doc.text_blocks + [empty]
|
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
|
|
37
34
|
|
38
35
|
false
|
39
36
|
end
|
40
|
-
|
41
37
|
end
|
42
38
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
# Merges two subsequent blocks if their text densities are equal.
|
2
2
|
|
3
3
|
module Boilerpipe::Filters
|
4
4
|
class SimpleBlockFusionProcessor
|
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
-
doc.replace_text_blocks!(
|
20
|
+
doc.replace_text_blocks!(tbs - blocks_to_remove)
|
21
21
|
doc
|
22
22
|
end
|
23
23
|
end
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Splits TextBlocks at paragraph boundaries.
|
3
2
|
#
|
4
3
|
# NOTE: This is not fully supported (i.e., it will break highlighting support via
|
@@ -8,7 +7,6 @@
|
|
8
7
|
|
9
8
|
module Boilerpipe::Filters
|
10
9
|
class SplitParagraphBlocksFilter
|
11
|
-
|
12
10
|
def self.process(doc)
|
13
11
|
tbs = doc.text_blocks
|
14
12
|
new_blocks = []
|
@@ -35,6 +33,5 @@ module Boilerpipe::Filters
|
|
35
33
|
doc.replace_text_blocks!(new_blocks) if changes
|
36
34
|
doc
|
37
35
|
end
|
38
|
-
|
39
36
|
end
|
40
37
|
end
|
@@ -1,15 +1,13 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
1
|
# Finds blocks which are potentially indicating the end of an article
|
4
2
|
# text and marks them with INDICATES_END_OF_TEXT. This can be used
|
5
3
|
# in conjunction with a downstream IgnoreBlocksAfterContentFilter.
|
6
4
|
|
7
|
-
|
8
5
|
module Boilerpipe::Filters
|
9
6
|
class TerminatingBlocksFinder
|
10
7
|
def self.process(doc)
|
11
8
|
doc.text_blocks.each do |tb|
|
12
9
|
next unless tb.num_words < 15
|
10
|
+
|
13
11
|
if tb.text.length >= 8 && finds_match?(tb.text.downcase)
|
14
12
|
tb.labels << :INDICATES_END_OF_TEXT
|
15
13
|
elsif tb.link_density == 1.0 && tb.text == 'comment'
|
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
|
|
29
27
|
text.include?('what you think...') ||
|
30
28
|
text.include?('add your comment') ||
|
31
29
|
text.include?('add comment') ||
|
32
|
-
#TODO add this and test
|
33
|
-
#text.include?('leave a reply') ||
|
34
|
-
#text.include?('leave a comment') ||
|
35
|
-
#text.include?('show comments') ||
|
36
|
-
#text.include?('Share this:') ||
|
30
|
+
# TODO add this and test
|
31
|
+
# text.include?('leave a reply') ||
|
32
|
+
# text.include?('leave a comment') ||
|
33
|
+
# text.include?('show comments') ||
|
34
|
+
# text.include?('Share this:') ||
|
37
35
|
text.include?('reader views') ||
|
38
36
|
text.include?('have your say') ||
|
39
37
|
text.include?('reader comments') ||
|
@@ -1,4 +1,3 @@
|
|
1
|
-
|
2
1
|
# Marks trailing headlines TextBlocks that have the label :#HEADING
|
3
2
|
# as boilerplate. Trailing means they are marked content and are
|
4
3
|
# below any other content block.
|
@@ -6,7 +5,6 @@
|
|
6
5
|
module Boilerpipe::Filters
|
7
6
|
class TrailingHeadlineToBoilerplateFilter
|
8
7
|
def self.process(doc)
|
9
|
-
|
10
8
|
doc.text_blocks.each do |tb|
|
11
9
|
next unless tb.is_content?
|
12
10
|
|
@@ -19,6 +17,5 @@ module Boilerpipe::Filters
|
|
19
17
|
|
20
18
|
doc
|
21
19
|
end
|
22
|
-
|
23
20
|
end
|
24
21
|
end
|
@@ -1,20 +1,16 @@
|
|
1
|
-
require 'nokogiri'
|
2
1
|
module Boilerpipe::SAX
|
3
2
|
class BoilerpipeHTMLParser
|
4
3
|
def self.parse(text)
|
5
|
-
|
6
|
-
#script bug - delete script tags
|
4
|
+
# script bug - delete script tags
|
7
5
|
text.gsub!(/\<script>.+?<\/script>/i, '')
|
8
6
|
|
9
7
|
# nokogiri uses libxml for mri and nekohtml for jruby
|
10
8
|
# mri doesn't remove &nbsp when missing the semicolon
|
11
9
|
text.gsub!(/(&nbsp) /, '\1; ')
|
12
10
|
|
13
|
-
|
14
11
|
# use nokogiri to fix any bad tags, errors - keep experimenting with this
|
15
12
|
text = Nokogiri::HTML(text).to_html
|
16
13
|
|
17
|
-
|
18
14
|
handler = HTMLContentHandler.new
|
19
15
|
noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
|
20
16
|
noko_parser.parse(text)
|
@@ -1,11 +1,8 @@
|
|
1
|
-
require 'nokogiri'
|
2
|
-
require 'set'
|
3
|
-
|
4
1
|
module Boilerpipe::SAX
|
5
2
|
class HTMLContentHandler < Nokogiri::XML::SAX::Document
|
6
3
|
attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
|
7
4
|
|
8
|
-
attr_accessor :in_anchor_tag, :token_buffer
|
5
|
+
attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
|
9
6
|
ANCHOR_TEXT_START = "$\ue00a<"
|
10
7
|
ANCHOR_TEXT_END = ">\ue00a$"
|
11
8
|
|
@@ -34,7 +31,6 @@ module Boilerpipe::SAX
|
|
34
31
|
@label_stacks << nil
|
35
32
|
tag = name.upcase.intern
|
36
33
|
|
37
|
-
|
38
34
|
tag_action = @tag_actions[tag]
|
39
35
|
if tag_action
|
40
36
|
@tag_level += 1 if tag_action.changes_tag_level?
|
@@ -51,15 +47,15 @@ module Boilerpipe::SAX
|
|
51
47
|
def characters(text)
|
52
48
|
flush_block if @flush
|
53
49
|
|
54
|
-
return if
|
50
|
+
return if in_ignorable_element?
|
55
51
|
return if text.empty?
|
56
52
|
|
57
53
|
# replace all whitespace with simple space
|
58
54
|
text.gsub!(/\s+/, ' ')
|
59
55
|
|
60
56
|
# trim whitespace
|
61
|
-
started_with_whitespace = text
|
62
|
-
ended_with_whitespace = text
|
57
|
+
started_with_whitespace = text =~ /^\s/
|
58
|
+
ended_with_whitespace = text =~ /\s$/
|
63
59
|
text.strip!
|
64
60
|
|
65
61
|
# add a single space if the block was only whitespace
|
@@ -158,10 +154,10 @@ module Boilerpipe::SAX
|
|
158
154
|
end
|
159
155
|
|
160
156
|
text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
157
|
+
num_words,
|
158
|
+
num_linked_words,
|
159
|
+
num_words_in_wrapped_lines,
|
160
|
+
num_wrapped_lines, @offset_blocks)
|
165
161
|
|
166
162
|
@offset_blocks += 1
|
167
163
|
clear_buffers
|
@@ -187,10 +183,10 @@ module Boilerpipe::SAX
|
|
187
183
|
# \p{No} -- a numeric character of other type
|
188
184
|
|
189
185
|
def is_word?(word)
|
190
|
-
|
186
|
+
word =~ VALID_WORD_CHARACTER
|
191
187
|
end
|
192
188
|
|
193
|
-
#public void flushBlock() {
|
189
|
+
# public void flushBlock() {
|
194
190
|
# int numWords = 0;
|
195
191
|
# int numLinkedWords = 0;
|
196
192
|
# int numWrappedLines = 0;
|
@@ -198,12 +194,13 @@ module Boilerpipe::SAX
|
|
198
194
|
# final int maxLineLength = 80;
|
199
195
|
# int numTokens = 0;
|
200
196
|
# int numWordsCurrentLine = 0;
|
201
|
-
#}
|
197
|
+
# }
|
202
198
|
|
203
199
|
def increase_in_ignorable_element!
|
204
200
|
@in_ignorable_element += 1
|
205
201
|
end
|
206
202
|
|
203
|
+
# should we prevent less than zero here?
|
207
204
|
def decrease_in_ignorable_element!
|
208
205
|
@in_ignorable_element -= 1
|
209
206
|
end
|
@@ -224,7 +221,6 @@ module Boilerpipe::SAX
|
|
224
221
|
@in_anchor_tag > 0
|
225
222
|
end
|
226
223
|
|
227
|
-
|
228
224
|
def add_text_block(text_block)
|
229
225
|
@label_stacks.each do |stack|
|
230
226
|
next unless stack
|
@@ -239,6 +235,7 @@ module Boilerpipe::SAX
|
|
239
235
|
# append space if last character wasn't already one
|
240
236
|
def append_space
|
241
237
|
return if @sb_last_was_whitespace
|
238
|
+
|
242
239
|
@sb_last_was_whitespace = true
|
243
240
|
|
244
241
|
@text_buffer << ' '
|
@@ -2,7 +2,7 @@ module Boilerpipe::SAX::TagActions
|
|
2
2
|
class AnchorText
|
3
3
|
# Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
|
4
4
|
# There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
|
5
|
-
|
5
|
+
# * encounters such nestings, a SAXException is thrown.
|
6
6
|
def start(handler, name, attrs)
|
7
7
|
if handler.in_anchor_tag?
|
8
8
|
handler.in_anchor_tag += 1
|
@@ -42,7 +42,7 @@ module Boilerpipe::SAX::TagActions
|
|
42
42
|
# - dunno about nokogiri???????
|
43
43
|
# as nested A elements are not allowed per specification, we
|
44
44
|
# are probably reaching this branch due to a bug in the XML parser
|
45
|
-
#puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
|
45
|
+
# puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
|
46
46
|
end_tag(handler, name)
|
47
47
|
end
|
48
48
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
|
3
|
-
|
2
|
+
# Explicitly marks this tag a simple "block-level" element,
|
3
|
+
# which always generates whitespace
|
4
4
|
class BlockLevel
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
true
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
# for block-level elements, which triggers some LabelAction on
|
3
|
-
# the generated TextBlock.
|
2
|
+
# for block-level elements, which triggers some LabelAction on
|
3
|
+
# the generated TextBlock.
|
4
4
|
class BlockTagLabel
|
5
5
|
def initialize(label_action)
|
6
6
|
@label_action = label_action
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Boilerpipe::SAX::TagActions
|
2
|
-
|
3
|
-
|
2
|
+
# Marks this tag the body element (this should usually only
|
3
|
+
# be set for the <BODY> tag).
|
4
4
|
class Body
|
5
5
|
def start(handler, name, attrs)
|
6
6
|
handler.flush_block
|
@@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions
|
|
10
10
|
rel = m[1]
|
11
11
|
val = m[2].to_i # absolute
|
12
12
|
size = rel.empty? ? val : relative(handler.font_size_stack, rel, val)
|
13
|
-
handler.font_size_stack <<
|
13
|
+
handler.font_size_stack << size
|
14
14
|
else
|
15
15
|
handler.font_size_stack << nil
|
16
16
|
end
|
@@ -27,7 +27,7 @@ module Boilerpipe::SAX::TagActions
|
|
27
27
|
end
|
28
28
|
|
29
29
|
def relative(font_size_stack, rel, val)
|
30
|
-
prev_size = font_size_stack.reverse_each.find{|s| s
|
30
|
+
prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
|
31
31
|
prev_size = 3 if prev_size.nil?
|
32
32
|
|
33
33
|
size = if rel == '+'
|
data/lib/boilerpipe/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boilerpipe-ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.0
|
4
|
+
version: 0.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Gregory Ostermayr
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-07-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -16,14 +16,14 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -39,33 +39,33 @@ dependencies:
|
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '10.0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
|
-
name:
|
42
|
+
name: rickshaw
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
45
|
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version:
|
47
|
+
version: 0.4.0
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
52
|
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version:
|
54
|
+
version: 0.4.0
|
55
55
|
- !ruby/object:Gem::Dependency
|
56
|
-
name:
|
56
|
+
name: rspec
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
58
58
|
requirements:
|
59
59
|
- - "~>"
|
60
60
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
61
|
+
version: '3.0'
|
62
62
|
type: :development
|
63
63
|
prerelease: false
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
65
65
|
requirements:
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
|
-
version:
|
68
|
+
version: '3.0'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: nokogiri
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
@@ -80,7 +80,7 @@ dependencies:
|
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: 1.6.6.2
|
83
|
-
description: A pure ruby implementation of the boilerpipe algorithm
|
83
|
+
description: A pure ruby implementation of the boilerpipe web content extraction algorithm
|
84
84
|
email:
|
85
85
|
- "<gregory.ostermayr@gmail.com>"
|
86
86
|
executables: []
|
@@ -88,9 +88,11 @@ extensions: []
|
|
88
88
|
extra_rdoc_files: []
|
89
89
|
files:
|
90
90
|
- ".circleci/config.yml"
|
91
|
+
- ".dockerignore"
|
91
92
|
- ".gitignore"
|
92
93
|
- ".rspec"
|
93
94
|
- CHANGELOG.md
|
95
|
+
- Dockerfile
|
94
96
|
- Gemfile
|
95
97
|
- LICENSE.txt
|
96
98
|
- README.md
|
@@ -98,6 +100,7 @@ files:
|
|
98
100
|
- bin/console
|
99
101
|
- bin/setup
|
100
102
|
- boilerpipe-ruby.gemspec
|
103
|
+
- boilerpipe_flow.md
|
101
104
|
- lib/boilerpipe.rb
|
102
105
|
- lib/boilerpipe/document/text_block.rb
|
103
106
|
- lib/boilerpipe/document/text_document.rb
|
@@ -166,9 +169,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
166
169
|
- !ruby/object:Gem::Version
|
167
170
|
version: '0'
|
168
171
|
requirements: []
|
169
|
-
|
170
|
-
rubygems_version: 2.6.12
|
172
|
+
rubygems_version: 3.0.1
|
171
173
|
signing_key:
|
172
174
|
specification_version: 4
|
173
|
-
summary: A pure ruby
|
175
|
+
summary: A pure ruby implementation of the boilerpipe web content extraction algorithm
|
174
176
|
test_files: []
|