slasher 0.5.2 → 0.5.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e1ee36cabdce72e25caa6ea235041ffb0e97803d
4
- data.tar.gz: daf78e2cffaaf35d5499e66515c4e0a4e3eb6788
3
+ metadata.gz: 7f0cfc1f144aae023283e6686c5c48299b25dda1
4
+ data.tar.gz: fb9f0d069941524e3b318614b63e0dc04174544e
5
5
  SHA512:
6
- metadata.gz: b846cf03de7719317ae426ba8698510246875797f5bd956e86c07c27803e69b872256eaead22549e37d4c9f5c18ffe177fd21ce90d5d2276afa45c64158315b9
7
- data.tar.gz: f9f9b685d6b03f3528d05386b5b99527f8dc85889e4e5bcb7040f362f5076df74fe365cc398aab496b1e99bf44d729abef05ab4e1383efb69580c965e0b621c4
6
+ metadata.gz: d978c99ee6d4536997da088598f1720952b372a36bc678b99b751d72171c5c5b338e1ffb2b8938e249788bd217f97e2ada3b091f3699910c6357668e19094cc0
7
+ data.tar.gz: cd6ecbb87fe30349031255284ec7c35602146234b8aba6158865aa1c8d3c110d4cba418168f8caed26c80d007695408081c3e2f6d641d16979dd3cd05c0be37d
data/.gitignore CHANGED
@@ -32,7 +32,8 @@ build/
32
32
 
33
33
  # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
34
34
  .rvmrc
35
- *.gemspec
35
+ #*.gemspec
36
36
 
37
37
  /spec/cases/
38
38
  /spec/cases_spec.rb
39
+ /benchmarks/
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- slasher (0.5.1)
4
+ slasher (0.5.2)
5
5
  nokogiri (~> 1.6)
6
6
 
7
7
  GEM
data/README.md CHANGED
@@ -8,6 +8,19 @@ This project is actually the ruby version of [slasherjs](https://github.com/hafi
8
8
  The result of extraction depends on assumptions about the structure of the HTML document itself. Therefore, there may be flaws in the result if the document doesn't match the structure that is recognised by the library.
9
9
  For this reason, the library will be improved from time to time.
10
10
 
11
+ ## How To Install
12
+
13
+ Like other rubygems, just:
14
+ ```
15
+ gem install slasher
16
+ ```
17
+
18
+ or put this on your `Gemfile`
19
+
20
+ ```
21
+ gem 'slasher'
22
+ ```
23
+
11
24
  ## How To Use
12
25
 
13
26
  To use the library, you need to have an HTML document first.
@@ -30,7 +43,5 @@ This library has been tested against some websites and you can see the complete
30
43
 
31
44
  ## TODO
32
45
  1. Add more test cases: international websites
33
- 2. Anytime I want to slash a new site, I don't need to re initialize the object.
34
- 3. Add gem dependencies (nokogiri)
35
- 4. Move test to travis
36
- 5. Better information for gem
46
+ 2. Performance analysis
47
+ 3. Better API documentation
data/changelog.md ADDED
@@ -0,0 +1,3 @@
1
+ ## version: 0.5.3
2
+ * add newline when concatenating paragraphs
3
+ * remove newline when pushing content to collection
@@ -19,3 +19,8 @@
19
19
  19. jakpost.travel
20
20
  20. dailysocial.net
21
21
  21. teknojurnal.com
22
+ 22. mashable.com
23
+ 23. huffingtonpost.com
24
+ 24. techinasia.com
25
+ 25. techcrunch.com
26
+ 26. arstechnica.com
@@ -8,8 +8,8 @@ class Slasher
8
8
 
9
9
  def push_content(content)
10
10
  stored_content = {
11
- length: content.gsub(/\s/, '').size,
12
- content: content
11
+ length: content.gsub(/\s/, '').delete("\n").size,
12
+ content: content.delete("\n")
13
13
  }
14
14
  @collection << stored_content
15
15
  end
data/lib/slasher/dom.rb CHANGED
@@ -24,20 +24,16 @@ class Slasher
24
24
  end
25
25
 
26
26
  def get_paragraphs_content(node)
27
- content = ""
28
- node.send(:>, "p").each do |p|
29
- content += p.text
27
+ node.send(:>, "p").map do |p|
28
+ p.text
30
29
  p.remove
31
- end
32
- content
30
+ end.join(" ")
33
31
  end
34
32
 
35
33
  def get_texts(node)
36
- content = ""
37
- node.children.each do |child|
38
- content += child.text.delete("\n").strip if child.text?
39
- end
40
- content
34
+ node.children.map do |child|
35
+ child.text.delete("\n").strip if child.text?
36
+ end.join
41
37
  end
42
38
  end
43
39
  end
data/lib/slasher.rb CHANGED
@@ -14,20 +14,22 @@ class Slasher
14
14
 
15
15
  doc.children.each do |child|
16
16
  if child.send(:>, "p").count > 0
17
- p_content = dom.get_paragraphs_content(child)
18
- content.push_content(p_content)
17
+ content.push_content dom.get_paragraphs_content(child)
19
18
  end
20
19
 
21
20
  if child.children.count > 0
22
21
  recursive_slash(child)
23
22
  else
24
- if child.text != '' && !child.text.nil?
25
- content.push_content(child.text)
26
- end
23
+ content.push_content(child.text) if child.text != '' && !child.text.nil?
27
24
  end
28
25
  end
29
26
  end
30
27
 
28
+ def reset(html)
29
+ @dom = Slasher::DOM.new(html)
30
+ @content = Slasher::Content.new
31
+ end
32
+
31
33
  def slash
32
34
  dom.remove_elements
33
35
  dom.strip_elements
data/slasher.gemspec ADDED
@@ -0,0 +1,18 @@
1
+ Gem::Specification.new do |gem|
2
+ gem.name = 'slasher'
3
+ gem.version = '0.5.3'
4
+ gem.date = '2015-08-02'
5
+ gem.summary = 'Extract the content of an HTML article'
6
+ gem.description = 'This gem could extract the real content of and HTML article based on weight of words in HTML dom nodes.'
7
+ gem.authors = ['Hafiz Badrie Lubis']
8
+ gem.email = 'hafizbadrie@gmail.com'
9
+ gem.files = `git ls-files`.split($\)
10
+ gem.homepage = 'http://github.com/hafizbadrie/slasherrb'
11
+ gem.license = 'MIT'
12
+
13
+ gem.add_development_dependency 'rspec', '~> 3.2'
14
+ gem.add_development_dependency 'rspec-collection_matchers', '~> 1.1'
15
+ gem.add_development_dependency 'capybara', '~> 2.4'
16
+ gem.add_development_dependency 'pry', '~> 0.10'
17
+ gem.add_runtime_dependency 'nokogiri', '~> 1.6'
18
+ end
@@ -8,7 +8,7 @@ describe Slasher::Content do
8
8
  end
9
9
 
10
10
  describe "#push_content" do
11
- let(:content_1) { "This is just a content that needs to be stored in a collection" }
11
+ let(:content_1) { "This is just a content that\nneeds to be stored in a collection" }
12
12
  let(:content_2) { "This is just a content" }
13
13
  let(:content) { Slasher::Content.new }
14
14
 
@@ -16,8 +16,8 @@ describe Slasher::Content do
16
16
  content.push_content(content_1)
17
17
  content.push_content(content_2)
18
18
  expect(content.collection).to have(2).items
19
- expect(content.collection.first[:length]).to eq content_1.gsub(/\s/, '').size
20
- expect(content.collection.first[:content]).to eq content_1
19
+ expect(content.collection.first[:length]).to eq content_1.gsub(/\s/, '').delete("\n").size
20
+ expect(content.collection.first[:content]).to eq content_1.delete("\n")
21
21
  end
22
22
  end
23
23
 
@@ -51,7 +51,7 @@ describe Slasher::DOM do
51
51
 
52
52
  it "will get all the content inside tag p from specific parent" do
53
53
  content = dom.get_paragraphs_content(dom.document.xpath("//div[@class='content']"))
54
- expect(content).to eq "This is first paragraph.This is second paragraph.This is third paragraph."
54
+ expect(content).to eq "This is first paragraph. This is second paragraph. This is third paragraph."
55
55
 
56
56
  content = dom.get_paragraphs_content(dom.document.xpath("//div[@class='sidebar']"))
57
57
  expect(content).to eq "This is paragraph"
data/spec/slasher_spec.rb CHANGED
@@ -27,7 +27,19 @@ describe Slasher do
27
27
 
28
28
  it "will return the longest/highest content" do
29
29
  content = slasher.slash
30
- expect(content).to eq "This is first paragraph.This is second paragraph.This is third paragraph."
30
+ expect(content).to eq "This is first paragraph. This is second paragraph. This is third paragraph."
31
+ end
32
+ end
33
+
34
+ describe "#reset" do
35
+ let(:html) { File.open("spec/fixtures/test_doc.html") }
36
+ let(:slasher) { Slasher.new(html) }
37
+ let(:new_html) { File.open("spec/fixtures/test.html") }
38
+
39
+ it "will reset the document and content attributes into a new one" do
40
+ document = slasher.dom.document
41
+ slasher.reset(new_html)
42
+ expect(slasher.dom.document).not_to eq document
31
43
  end
32
44
  end
33
45
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slasher
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.5.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Hafiz Badrie Lubis
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-06-21 00:00:00.000000000 Z
11
+ date: 2015-08-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -92,10 +92,12 @@ files:
92
92
  - Gemfile
93
93
  - Gemfile.lock
94
94
  - README.md
95
+ - changelog.md
95
96
  - doc/website_coverage.txt
96
97
  - lib/slasher.rb
97
98
  - lib/slasher/content.rb
98
99
  - lib/slasher/dom.rb
100
+ - slasher.gemspec
99
101
  - spec/fixtures/test.html
100
102
  - spec/fixtures/test_doc.html
101
103
  - spec/fixtures/test_paragraph.html