RubyGems - textract - Versions diffs - 0.0.7 → 0.0.8 - Mend

textract 0.0.7 → 0.0.8

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.

Files changed (5) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a0d8ffc9fc175116ed6a851cc12a992b4c2eb95b
-  data.tar.gz: d6093955a7cf136b0430246eebd62773f0c36d6f
+  metadata.gz: f61cb21afc706941ebc3f09756dd153f7e6bd4ae
+  data.tar.gz: 78905e74279b209a7ccce1095b543050f749bb6f
 SHA512:
-  metadata.gz: 2ed2d11aa3988cbc81b31b7da07e9d74d09a7f1b057ca450d7fa78d1bd919a4f4dc83caf691a81e9bdb16fd58c848b723a1a27c77f0be3e503d8bce65bc8b5ad
-  data.tar.gz: 377327580500633d4f21ead9adfc610ce7fba65b752525ca5c6879e615e412732b228fd489789f5166f8c1971e5b8ec75207aa722fae96467af41becf67acfda
+  metadata.gz: 9197c83b96dda8e79e3c88c08cb2aec64e3fa6b019722df9c92d7e84151d54390df9db18024f7cd5b2c00efeb4b314dafd8e48aa3bda89e31f97211ae122655d
+  data.tar.gz: c454baf54a7be0032972c4bb20e2ca87526942cf40f35a6bb83025588f299ef03c3dff0e25af278068c14211a10125f083267244bd7f9e28f9d65d3c3150c06a

data/lib/textract.rb CHANGED Viewed

@@ -7,6 +7,10 @@ require 'readability'
 module Textract
   # attr_accessor :client
+  TAG_WHITELIST = %w[
+    div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong
+    figure
+  ]
   def self.get_text(url, selectors=nil, format="markdown")
     @client = Client.new(url, selectors, format)
@@ -43,7 +47,7 @@ module Textract
       article_el = doc
     end
     Readability::Document.new(article_el.to_s,
-                                        tags: %w[div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong],
+                                        tags: TAG_WHITELIST,
                                         attributes: %w[src href],
                                         remove_empty_nodes: false,
                                        )

data/lib/textract/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Textract
-  VERSION = "0.0.7"
+  VERSION = "0.0.8"
 end

data/spec/lib/textract_spec.rb CHANGED Viewed

@@ -5,7 +5,7 @@ RSpec.configure do |c|
   c.filter_run :focus => true
 end
-describe Textract do
+describe Textract, :focus do
   it "initializes with the get_text method" do
     url = "http://www.tedcruz.org/about/"
     article = Textract.get_text(url)
@@ -15,12 +15,12 @@ describe Textract do
   it "returns article text based on article tag" do
     url = "http://gawker.com/1694508525"
     article = Textract.get_text(url)
-    expect(article.text[0..5]).to eq "Import"
-    expect(article.md5).to eq "ae57104339fbd6455a91f8ebdc94b90c"
+    expect(article.text.include?("Import")).to eq true
+    expect(article.md5).to eq "c11a810a3e73f24aac78fd3e39e69f87"
     expect(article.author).to eq "Hamilton Nolan"
   end
-  it "also includes images", :focus do
+  it "also includes images" do
     url = "http://gawker.com/1696731611"
     img = "http://i.kinja-img.com/gawker-media/image/upload/s--fWYFlEv6--/c_fit,fl_progressive,q_80,w_636/l3sjlg0ariqomd4ubtl6.jpg"
     article = Textract.get_text(url)
@@ -30,7 +30,7 @@ describe Textract do
   it "returns article text based on opengraph description" do
     url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
     article = Textract.get_text(url)
-    expect(article.text[0..5]).to eq "Ted Cr"
+    expect(article.text.include?("Ted Cruz")).to eq true
   end
   it "can find a twitter profile given a selector" do

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: textract
 version: !ruby/object:Gem::Version
-  version: 0.0.7
+  version: 0.0.8
 platform: ruby
 authors:
 - Adam Pash