slasher 0.5.2 → 0.5.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -1
- data/Gemfile.lock +1 -1
- data/README.md +15 -4
- data/changelog.md +3 -0
- data/doc/website_coverage.txt +5 -0
- data/lib/slasher/content.rb +2 -2
- data/lib/slasher/dom.rb +6 -10
- data/lib/slasher.rb +7 -5
- data/slasher.gemspec +18 -0
- data/spec/slasher/content_spec.rb +3 -3
- data/spec/slasher/dom_spec.rb +1 -1
- data/spec/slasher_spec.rb +13 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7f0cfc1f144aae023283e6686c5c48299b25dda1
|
4
|
+
data.tar.gz: fb9f0d069941524e3b318614b63e0dc04174544e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d978c99ee6d4536997da088598f1720952b372a36bc678b99b751d72171c5c5b338e1ffb2b8938e249788bd217f97e2ada3b091f3699910c6357668e19094cc0
|
7
|
+
data.tar.gz: cd6ecbb87fe30349031255284ec7c35602146234b8aba6158865aa1c8d3c110d4cba418168f8caed26c80d007695408081c3e2f6d641d16979dd3cd05c0be37d
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -8,6 +8,19 @@ This project is actually the ruby version of [slasherjs](https://github.com/hafi
|
|
8
8
|
The result of the extraction depends on assumptions about the structure of the HTML document itself. Therefore, there may be flaws in the result if the document doesn't match the structure that is recognised by the library.
|
9
9
|
Because of this, the library will be improved from time to time.
|
10
10
|
|
11
|
+
## How To Install
|
12
|
+
|
13
|
+
Like other rubygems, just:
|
14
|
+
```
|
15
|
+
gem install slasher
|
16
|
+
```
|
17
|
+
|
18
|
+
or put this in your `Gemfile`:
|
19
|
+
|
20
|
+
```
|
21
|
+
gem 'slasher'
|
22
|
+
```
|
23
|
+
|
11
24
|
## How To Use
|
12
25
|
|
13
26
|
To use the library, you need to have an HTML document first.
|
@@ -30,7 +43,5 @@ This library has been tested against some websites and you can see the complete
|
|
30
43
|
|
31
44
|
## TODO
|
32
45
|
1. Add more test cases: international websites
|
33
|
-
2.
|
34
|
-
3.
|
35
|
-
4. Move test to travis
|
36
|
-
5. Better information for gem
|
46
|
+
2. Performance analysis
|
47
|
+
3. Better API documentation
|
data/changelog.md
ADDED
data/doc/website_coverage.txt
CHANGED
data/lib/slasher/content.rb
CHANGED
data/lib/slasher/dom.rb
CHANGED
@@ -24,20 +24,16 @@ class Slasher
|
|
24
24
|
end
|
25
25
|
|
26
26
|
def get_paragraphs_content(node)
|
27
|
-
|
28
|
-
|
29
|
-
content += p.text
|
27
|
+
node.send(:>, "p").map do |p|
|
28
|
+
p.text
|
30
29
|
p.remove
|
31
|
-
end
|
32
|
-
content
|
30
|
+
end.join(" ")
|
33
31
|
end
|
34
32
|
|
35
33
|
def get_texts(node)
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
end
|
40
|
-
content
|
34
|
+
node.children.map do |child|
|
35
|
+
child.text.delete("\n").strip if child.text?
|
36
|
+
end.join
|
41
37
|
end
|
42
38
|
end
|
43
39
|
end
|
data/lib/slasher.rb
CHANGED
@@ -14,20 +14,22 @@ class Slasher
|
|
14
14
|
|
15
15
|
doc.children.each do |child|
|
16
16
|
if child.send(:>, "p").count > 0
|
17
|
-
|
18
|
-
content.push_content(p_content)
|
17
|
+
content.push_content dom.get_paragraphs_content(child)
|
19
18
|
end
|
20
19
|
|
21
20
|
if child.children.count > 0
|
22
21
|
recursive_slash(child)
|
23
22
|
else
|
24
|
-
if child.text != '' && !child.text.nil?
|
25
|
-
content.push_content(child.text)
|
26
|
-
end
|
23
|
+
content.push_content(child.text) if child.text != '' && !child.text.nil?
|
27
24
|
end
|
28
25
|
end
|
29
26
|
end
|
30
27
|
|
28
|
+
def reset(html)
|
29
|
+
@dom = Slasher::DOM.new(html)
|
30
|
+
@content = Slasher::Content.new
|
31
|
+
end
|
32
|
+
|
31
33
|
def slash
|
32
34
|
dom.remove_elements
|
33
35
|
dom.strip_elements
|
data/slasher.gemspec
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
Gem::Specification.new do |gem|
|
2
|
+
gem.name = 'slasher'
|
3
|
+
gem.version = '0.5.3'
|
4
|
+
gem.date = '2015-08-02'
|
5
|
+
gem.summary = 'Extract the content of an HTML article'
|
6
|
+
gem.description = 'This gem could extract the real content of and HTML article based on weight of words in HTML dom nodes.'
|
7
|
+
gem.authors = ['Hafiz Badrie Lubis']
|
8
|
+
gem.email = 'hafizbadrie@gmail.com'
|
9
|
+
gem.files = `git ls-files`.split($\)
|
10
|
+
gem.homepage = 'http://github.com/hafizbadrie/slasherrb'
|
11
|
+
gem.license = 'MIT'
|
12
|
+
|
13
|
+
gem.add_development_dependency 'rspec', '~> 3.2'
|
14
|
+
gem.add_development_dependency 'rspec-collection_matchers', '~> 1.1'
|
15
|
+
gem.add_development_dependency 'capybara', '~> 2.4'
|
16
|
+
gem.add_development_dependency 'pry', '~> 0.10'
|
17
|
+
gem.add_runtime_dependency 'nokogiri', '~> 1.6'
|
18
|
+
end
|
@@ -8,7 +8,7 @@ describe Slasher::Content do
|
|
8
8
|
end
|
9
9
|
|
10
10
|
describe "#push_content" do
|
11
|
-
let(:content_1) { "This is just a content that
|
11
|
+
let(:content_1) { "This is just a content that\nneeds to be stored in a collection" }
|
12
12
|
let(:content_2) { "This is just a content" }
|
13
13
|
let(:content) { Slasher::Content.new }
|
14
14
|
|
@@ -16,8 +16,8 @@ describe Slasher::Content do
|
|
16
16
|
content.push_content(content_1)
|
17
17
|
content.push_content(content_2)
|
18
18
|
expect(content.collection).to have(2).items
|
19
|
-
expect(content.collection.first[:length]).to eq content_1.gsub(/\s/, '').size
|
20
|
-
expect(content.collection.first[:content]).to eq content_1
|
19
|
+
expect(content.collection.first[:length]).to eq content_1.gsub(/\s/, '').delete("\n").size
|
20
|
+
expect(content.collection.first[:content]).to eq content_1.delete("\n")
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
data/spec/slasher/dom_spec.rb
CHANGED
@@ -51,7 +51,7 @@ describe Slasher::DOM do
|
|
51
51
|
|
52
52
|
it "will get all the content inside tag p from specific parent" do
|
53
53
|
content = dom.get_paragraphs_content(dom.document.xpath("//div[@class='content']"))
|
54
|
-
expect(content).to eq "This is first paragraph.This is second paragraph.This is third paragraph."
|
54
|
+
expect(content).to eq "This is first paragraph. This is second paragraph. This is third paragraph."
|
55
55
|
|
56
56
|
content = dom.get_paragraphs_content(dom.document.xpath("//div[@class='sidebar']"))
|
57
57
|
expect(content).to eq "This is paragraph"
|
data/spec/slasher_spec.rb
CHANGED
@@ -27,7 +27,19 @@ describe Slasher do
|
|
27
27
|
|
28
28
|
it "will return the longest/highest content" do
|
29
29
|
content = slasher.slash
|
30
|
-
expect(content).to eq "This is first paragraph.This is second paragraph.This is third paragraph."
|
30
|
+
expect(content).to eq "This is first paragraph. This is second paragraph. This is third paragraph."
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
describe "#reset" do
|
35
|
+
let(:html) { File.open("spec/fixtures/test_doc.html") }
|
36
|
+
let(:slasher) { Slasher.new(html) }
|
37
|
+
let(:new_html) { File.open("spec/fixtures/test.html") }
|
38
|
+
|
39
|
+
it "will reset the document and content attributes into a new one" do
|
40
|
+
document = slasher.dom.document
|
41
|
+
slasher.reset(new_html)
|
42
|
+
expect(slasher.dom.document).not_to eq document
|
31
43
|
end
|
32
44
|
end
|
33
45
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slasher
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.5.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Hafiz Badrie Lubis
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-08-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -92,10 +92,12 @@ files:
|
|
92
92
|
- Gemfile
|
93
93
|
- Gemfile.lock
|
94
94
|
- README.md
|
95
|
+
- changelog.md
|
95
96
|
- doc/website_coverage.txt
|
96
97
|
- lib/slasher.rb
|
97
98
|
- lib/slasher/content.rb
|
98
99
|
- lib/slasher/dom.rb
|
100
|
+
- slasher.gemspec
|
99
101
|
- spec/fixtures/test.html
|
100
102
|
- spec/fixtures/test_doc.html
|
101
103
|
- spec/fixtures/test_paragraph.html
|