textract 0.0.7 → 0.0.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a0d8ffc9fc175116ed6a851cc12a992b4c2eb95b
4
- data.tar.gz: d6093955a7cf136b0430246eebd62773f0c36d6f
3
+ metadata.gz: f61cb21afc706941ebc3f09756dd153f7e6bd4ae
4
+ data.tar.gz: 78905e74279b209a7ccce1095b543050f749bb6f
5
5
  SHA512:
6
- metadata.gz: 2ed2d11aa3988cbc81b31b7da07e9d74d09a7f1b057ca450d7fa78d1bd919a4f4dc83caf691a81e9bdb16fd58c848b723a1a27c77f0be3e503d8bce65bc8b5ad
7
- data.tar.gz: 377327580500633d4f21ead9adfc610ce7fba65b752525ca5c6879e615e412732b228fd489789f5166f8c1971e5b8ec75207aa722fae96467af41becf67acfda
6
+ metadata.gz: 9197c83b96dda8e79e3c88c08cb2aec64e3fa6b019722df9c92d7e84151d54390df9db18024f7cd5b2c00efeb4b314dafd8e48aa3bda89e31f97211ae122655d
7
+ data.tar.gz: c454baf54a7be0032972c4bb20e2ca87526942cf40f35a6bb83025588f299ef03c3dff0e25af278068c14211a10125f083267244bd7f9e28f9d65d3c3150c06a
data/lib/textract.rb CHANGED
@@ -7,6 +7,10 @@ require 'readability'
7
7
 
8
8
  module Textract
9
9
  # attr_accessor :client
10
+ TAG_WHITELIST = %w[
11
+ div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong
12
+ figure
13
+ ]
10
14
 
11
15
  def self.get_text(url, selectors=nil, format="markdown")
12
16
  @client = Client.new(url, selectors, format)
@@ -43,7 +47,7 @@ module Textract
43
47
  article_el = doc
44
48
  end
45
49
  Readability::Document.new(article_el.to_s,
46
- tags: %w[div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong],
50
+ tags: TAG_WHITELIST,
47
51
  attributes: %w[src href],
48
52
  remove_empty_nodes: false,
49
53
  )
@@ -1,3 +1,3 @@
1
1
  module Textract
2
- VERSION = "0.0.7"
2
+ VERSION = "0.0.8"
3
3
  end
@@ -5,7 +5,7 @@ RSpec.configure do |c|
5
5
  c.filter_run :focus => true
6
6
  end
7
7
 
8
- describe Textract do
8
+ describe Textract, :focus do
9
9
  it "initializes with the get_text method" do
10
10
  url = "http://www.tedcruz.org/about/"
11
11
  article = Textract.get_text(url)
@@ -15,12 +15,12 @@ describe Textract do
15
15
  it "returns article text based on article tag" do
16
16
  url = "http://gawker.com/1694508525"
17
17
  article = Textract.get_text(url)
18
- expect(article.text[0..5]).to eq "Import"
19
- expect(article.md5).to eq "ae57104339fbd6455a91f8ebdc94b90c"
18
+ expect(article.text.include?("Import")).to eq true
19
+ expect(article.md5).to eq "c11a810a3e73f24aac78fd3e39e69f87"
20
20
  expect(article.author).to eq "Hamilton Nolan"
21
21
  end
22
22
 
23
- it "also includes images", :focus do
23
+ it "also includes images" do
24
24
  url = "http://gawker.com/1696731611"
25
25
  img = "http://i.kinja-img.com/gawker-media/image/upload/s--fWYFlEv6--/c_fit,fl_progressive,q_80,w_636/l3sjlg0ariqomd4ubtl6.jpg"
26
26
  article = Textract.get_text(url)
@@ -30,7 +30,7 @@ describe Textract do
30
30
  it "returns article text based on opengraph description" do
31
31
  url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
32
32
  article = Textract.get_text(url)
33
- expect(article.text[0..5]).to eq "Ted Cr"
33
+ expect(article.text.include?("Ted Cruz")).to eq true
34
34
  end
35
35
 
36
36
  it "can find a twitter profile given a selector" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Pash