RubyGems - slasher - Versions diffs - 0.5.2 → 0.5.3 - Mend

slasher 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e1ee36cabdce72e25caa6ea235041ffb0e97803d
-  data.tar.gz: daf78e2cffaaf35d5499e66515c4e0a4e3eb6788
+  metadata.gz: 7f0cfc1f144aae023283e6686c5c48299b25dda1
+  data.tar.gz: fb9f0d069941524e3b318614b63e0dc04174544e
 SHA512:
-  metadata.gz: b846cf03de7719317ae426ba8698510246875797f5bd956e86c07c27803e69b872256eaead22549e37d4c9f5c18ffe177fd21ce90d5d2276afa45c64158315b9
-  data.tar.gz: f9f9b685d6b03f3528d05386b5b99527f8dc85889e4e5bcb7040f362f5076df74fe365cc398aab496b1e99bf44d729abef05ab4e1383efb69580c965e0b621c4
+  metadata.gz: d978c99ee6d4536997da088598f1720952b372a36bc678b99b751d72171c5c5b338e1ffb2b8938e249788bd217f97e2ada3b091f3699910c6357668e19094cc0
+  data.tar.gz: cd6ecbb87fe30349031255284ec7c35602146234b8aba6158865aa1c8d3c110d4cba418168f8caed26c80d007695408081c3e2f6d641d16979dd3cd05c0be37d

data/.gitignore CHANGED Viewed

@@ -32,7 +32,8 @@ build/
 # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
 .rvmrc
-*.gemspec
+#*.gemspec
 /spec/cases/
 /spec/cases_spec.rb
+/benchmarks/

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    slasher (0.5.1)
+    slasher (0.5.2)
       nokogiri (~> 1.6)
 GEM

data/README.md CHANGED Viewed

@@ -8,6 +8,19 @@ This project is actually the ruby version of [slasherjs](https://github.com/hafi
 The result of extraction depends on assumptions about the structure of the HTML document itself. Therefore, there may be flaws in the result if the document doesn't match the structure that is recognised by the library.
 Because of this limitation, the library will be improved from time to time.
+## How To Install
+Like other rubygems, just:
+```
+gem install slasher
+```
+or add this to your `Gemfile`:
+```
+gem 'slasher'
+```
 ## How To Use
 To use the library, you need to have an HTML document first.
@@ -30,7 +43,5 @@ This library has been tested against some websites and you can see the complete
 ## TODO
 1. Add more test cases: international websites
-2. Anytime I want to slash a new site, I don't need to re initialize the object.
-3. Add gem dependencies (nokogiri)
-4. Move test to travis
-5. Better information for gem
+2. Performance analysis
+3. Better API documentation

data/changelog.md ADDED Viewed

@@ -0,0 +1,3 @@
+## version: 0.5.3
+  * add newline when concatenating paragraphs
+  * remove newline when pushing content to collection

data/doc/website_coverage.txt CHANGED Viewed

@@ -19,3 +19,8 @@
 19. jakpost.travel
 20. dailysocial.net
 21. teknojurnal.com
+22. mashable.com
+23. huffingtonpost.com
+24. techinasia.com
+25. techcrunch.com
+26. arstechnica.com

data/lib/slasher/content.rb CHANGED Viewed

@@ -8,8 +8,8 @@ class Slasher
     def push_content(content)
       stored_content = {
-        length: content.gsub(/\s/, '').size,
-        content: content
+        length: content.gsub(/\s/, '').delete("\n").size,
+        content: content.delete("\n")
       }
       @collection << stored_content
     end

data/lib/slasher/dom.rb CHANGED Viewed

@@ -24,20 +24,16 @@ class Slasher
     end
     def get_paragraphs_content(node)
-      content = ""
-      node.send(:>, "p").each do |p|
-        content += p.text
+      node.send(:>, "p").map do |p|
+        p.text
         p.remove
-      end
-      content
+      end.join(" ")
     end
     def get_texts(node)
-      content = ""
-      node.children.each do |child|
-        content += child.text.delete("\n").strip if child.text?
-      end
-      content
+      node.children.map do |child|
+        child.text.delete("\n").strip if child.text?
+      end.join
     end
   end
 end

data/lib/slasher.rb CHANGED Viewed

@@ -14,20 +14,22 @@ class Slasher
     doc.children.each do |child|
       if child.send(:>, "p").count > 0
-        p_content = dom.get_paragraphs_content(child)
-        content.push_content(p_content)
+        content.push_content dom.get_paragraphs_content(child)
       end
       if child.children.count > 0
         recursive_slash(child)
       else
-        if child.text != '' && !child.text.nil?
-          content.push_content(child.text)
-        end
+        content.push_content(child.text) if child.text != '' && !child.text.nil?
       end
     end
   end
+  def reset(html)
+    @dom     = Slasher::DOM.new(html)
+    @content = Slasher::Content.new
+  end
   def slash
     dom.remove_elements
     dom.strip_elements

data/slasher.gemspec ADDED Viewed

@@ -0,0 +1,18 @@
+Gem::Specification.new do |gem|
+  gem.name        = 'slasher'
+  gem.version     = '0.5.3'
+  gem.date        = '2015-08-02'
+  gem.summary     = 'Extract the content of an HTML article'
+  gem.description = 'This gem could extract the real content of and HTML article based on weight of words in HTML dom nodes.'
+  gem.authors     = ['Hafiz Badrie Lubis']
+  gem.email       = 'hafizbadrie@gmail.com'
+  gem.files       = `git ls-files`.split($\)
+  gem.homepage    = 'http://github.com/hafizbadrie/slasherrb'
+  gem.license     = 'MIT'
+  gem.add_development_dependency 'rspec', '~> 3.2'
+  gem.add_development_dependency 'rspec-collection_matchers', '~> 1.1'
+  gem.add_development_dependency 'capybara', '~> 2.4'
+  gem.add_development_dependency 'pry', '~> 0.10'
+  gem.add_runtime_dependency 'nokogiri', '~> 1.6'
+end

data/spec/slasher/content_spec.rb CHANGED Viewed

@@ -8,7 +8,7 @@ describe Slasher::Content do
   end
   describe "#push_content" do
-    let(:content_1) { "This is just a content that needs to be stored in a collection" }
+    let(:content_1) { "This is just a content that\nneeds to be stored in a collection" }
     let(:content_2) { "This is just a content" }
     let(:content) { Slasher::Content.new }
@@ -16,8 +16,8 @@ describe Slasher::Content do
       content.push_content(content_1)
       content.push_content(content_2)
       expect(content.collection).to have(2).items
-      expect(content.collection.first[:length]).to eq content_1.gsub(/\s/, '').size
-      expect(content.collection.first[:content]).to eq content_1
+      expect(content.collection.first[:length]).to eq content_1.gsub(/\s/, '').delete("\n").size
+      expect(content.collection.first[:content]).to eq content_1.delete("\n")
     end
   end

data/spec/slasher/dom_spec.rb CHANGED Viewed

@@ -51,7 +51,7 @@ describe Slasher::DOM do
     it "will get all the content inside tag p from specific parent" do
       content = dom.get_paragraphs_content(dom.document.xpath("//div[@class='content']"))
-      expect(content).to eq "This is first paragraph.This is second paragraph.This is third paragraph."
+      expect(content).to eq "This is first paragraph. This is second paragraph. This is third paragraph."
       content = dom.get_paragraphs_content(dom.document.xpath("//div[@class='sidebar']"))
       expect(content).to eq "This is paragraph"

data/spec/slasher_spec.rb CHANGED Viewed

@@ -27,7 +27,19 @@ describe Slasher do
     it "will return the longest/highest content" do
       content = slasher.slash
-      expect(content).to eq "This is first paragraph.This is second paragraph.This is third paragraph."
+      expect(content).to eq "This is first paragraph. This is second paragraph. This is third paragraph."
+    end
+  end
+  describe "#reset" do
+    let(:html) { File.open("spec/fixtures/test_doc.html") }
+    let(:slasher) { Slasher.new(html) }
+    let(:new_html) { File.open("spec/fixtures/test.html") }
+    it "will reset the document and content attributes into a new one" do
+      document = slasher.dom.document
+      slasher.reset(new_html)
+      expect(slasher.dom.document).not_to eq document
     end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: slasher
 version: !ruby/object:Gem::Version
-  version: 0.5.2
+  version: 0.5.3
 platform: ruby
 authors:
 - Hafiz Badrie Lubis
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-06-21 00:00:00.000000000 Z
+date: 2015-08-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
@@ -92,10 +92,12 @@ files:
 - Gemfile
 - Gemfile.lock
 - README.md
+- changelog.md
 - doc/website_coverage.txt
 - lib/slasher.rb
 - lib/slasher/content.rb
 - lib/slasher/dom.rb
+- slasher.gemspec
 - spec/fixtures/test.html
 - spec/fixtures/test_doc.html
 - spec/fixtures/test_paragraph.html