RubyGems - artext - Versions diffs - 0.0.1 → 0.0.2 - Mend

artext 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: dab606c0c96c6939da80ddf9e5ae61ecb905d90f
-  data.tar.gz: 7df963e4a642b499c776b844e2da9901e58dfefb
+  metadata.gz: 379fd2eab84219f4a82393817354930f7e6cbacd
+  data.tar.gz: cee44b5160c701f51c1836d6cd0a27d655ecf4bb
 SHA512:
-  metadata.gz: 13819faa7e432065a235b69ad2bcd326015716dff27e1bea9b8b7929e3c4b2d0a0ca77e2b16ee1f276714fe07bf066f14b563ad9c1d5e762859c414b7e7991dc
-  data.tar.gz: 8e8c98d7d52e137272181661a30921ef9d49af6cfd419cdbc9fb78c1b8f484bea879c93a2fac970c3a59d76bc1643df3a6f308b9436a480b05fa313c82605b1f
+  metadata.gz: e505ce5c26c90ecd95ffee18656626954276fc5333e607c158b650f2ded704a7b69a2181904555ae106f25e252488ca878b3aa58bccbd7fda21145c013290ac2
+  data.tar.gz: a6ffbd0ebe58681e9ecb9e0554e3aa0fc9ff446fe082ce8ab463362e16a83b1d91ddfdc30e6ecb08adf1a1b1e679ee363d2fcebf443b1a27b58fc260aa09d02b

data/Gemfile.lock CHANGED

@@ -48,7 +48,7 @@ DEPENDENCIES
   artext!
   bundler (~> 1.6)
   rake (~> 10.0)
-  rspec
+  rspec (~> 3.3)
 BUNDLED WITH
    1.10.6

data/README.md CHANGED

@@ -1,6 +1,6 @@
 # Artext
-TODO: Write a gem description
+Artext is a gem that extracts articles from web pages. It removes advertisements and other additional content, and shows only the core content of the article. It can be helpful for applications that show an article view.
 ## Installation
@@ -20,10 +20,43 @@ Or install it yourself as:
 ## Usage
-TODO: Write usage instructions here
+```ruby
+require 'artext'
+url = "http://techcrunch.com/2015/07/07/meet-lynx-an-app-for-sharing-links-with-friends/"
+Artext.extract(url)
+```
+## Response
+```ruby
+response =
+	{
+		:url => "http://techcrunch.com/2015/07/07/meet-lynx-an-app-for-sharing-links-with-friends/",
+		:data => [{
+				:image => "OG IMAGE",
+				:title => "OG TITLE",
+				:tags => ["KEYWORDS AND TAGS"],
+				:type => "OG TYPE",
+				:favicon => "FAVICON URL",
+				:theme => "DOMINANT COLOR IN THE FAVICON",
+			}],
+		:article => [{
+				:body => "SANITIZED HTML OF THE CORE CONTENT",
+				:text => "TEXT OF THE CORE CONTENT",
+				:images => ["ARRAY OF IMAGE URLS IN THE SANITIZED HTML"],
+				:date => "PUBLISHING DATE OF THE ARTICLE",
+				:author => ["PUBLISHING AUTHOR(s)"],
+				:score => SCORE BETWEEN 0 AND 1 RELATING TO HOW SUCCESSFULLY THE CONTENT WAS EXTRACTED,
+			}]
+	}
+```
 ## Contributing
+Thank you @amitsaxena for your inputs.
 1. Fork it ( https://github.com/[my-github-username]/artext/fork )
 2. Create your feature branch (`git checkout -b my-new-feature`)
 3. Commit your changes (`git commit -am 'Add some feature'`)

data/lib/artext.rb CHANGED

@@ -121,12 +121,11 @@ module Artext
     if (article.count > 1)
       article = get_correct_article(article)
       score = 0.8
-    end
-    if (!is_blank?(article))
+    end
+    if (is_blank?(article))
       article = find_article(doc)
       score = 0.6
     end
-    score = score - 0.5 if (type != "article")
     if (is_blank?(article))
       # image url
       begin
@@ -139,6 +138,7 @@ module Artext
     else
       article = remove_unwanted_items_from(article)
       article, score = find_relevant(article, score)
+      score = score - 0.5 if (type != "article")
       if (score > 0.9)
         html, imgs = iteratively_clean(article, "", [], score)
       else
@@ -163,7 +163,7 @@ module Artext
             rel = ps
           end
         end
-        if ((last_p > 5 && last_p == max_p) || max_p - total_p < 2)
+        if ((last_p > 5 && last_p == max_p) || total_p - max_p < 2)
           score = 0.95 if (score < 1)
           break
         end
@@ -224,7 +224,7 @@ module Artext
       end
     elsif (element.name == "h1" || element.name == "h2" || element.name == "h3" || element.name == "h4")
       tv = "<h2>#{element.text.split.join(" ")}</h2>" if (!is_blank?(element.text.split.join(" ")))
-    elsif (element.name == "p" || element.name == "div")
+    elsif (element.name == "p")
       p_elem, ti = extractp(element, score)
       tv = "<p>#{p_elem}</p>" if (!is_blank?(p_elem))
       images = images + ti if (!is_blank?(ti))
@@ -243,6 +243,16 @@ module Artext
     elsif (element.name == "ol" || element.name == "ul")
       tv, ti = listhandle(element)
       images = images + ti
+    elsif (element.name == "div" || element.name == "span")
+      html = ""
+      imgs = []
+      element.children.each do |elem|
+        tv, ti = get_element_html(elem, [], score)
+        html = html + tv if (!is_blank?(tv))
+        imgs = imgs + ti if (!is_blank?(ti))
+      end
+      tv = html
+      images = imgs
     end
     return tv, images
   end
@@ -356,7 +366,7 @@ module Artext
   end
   def self.remove_unwanted_items_from(article)
-    unwanted_elements = ["//script", "//comment()", "//aside", ".aside", "iframe", "//noscript", "//form"]
+    unwanted_elements = ["//script", "//comment()", "//aside", ".aside", "iframe", "//noscript", "//form", "//header", "//footer"]
     unwanted_elements.each do |elem|
       article.search("#{elem}").remove
     end

data/lib/artext/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Artext
-  VERSION = "0.0.1"
+  VERSION = "0.0.2"
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: artext
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - Anindya Mondal
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-09-12 00:00:00.000000000 Z
+date: 2015-09-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler