textract 0.0.7 → 0.0.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a0d8ffc9fc175116ed6a851cc12a992b4c2eb95b
4
- data.tar.gz: d6093955a7cf136b0430246eebd62773f0c36d6f
3
+ metadata.gz: f61cb21afc706941ebc3f09756dd153f7e6bd4ae
4
+ data.tar.gz: 78905e74279b209a7ccce1095b543050f749bb6f
5
5
  SHA512:
6
- metadata.gz: 2ed2d11aa3988cbc81b31b7da07e9d74d09a7f1b057ca450d7fa78d1bd919a4f4dc83caf691a81e9bdb16fd58c848b723a1a27c77f0be3e503d8bce65bc8b5ad
7
- data.tar.gz: 377327580500633d4f21ead9adfc610ce7fba65b752525ca5c6879e615e412732b228fd489789f5166f8c1971e5b8ec75207aa722fae96467af41becf67acfda
6
+ metadata.gz: 9197c83b96dda8e79e3c88c08cb2aec64e3fa6b019722df9c92d7e84151d54390df9db18024f7cd5b2c00efeb4b314dafd8e48aa3bda89e31f97211ae122655d
7
+ data.tar.gz: c454baf54a7be0032972c4bb20e2ca87526942cf40f35a6bb83025588f299ef03c3dff0e25af278068c14211a10125f083267244bd7f9e28f9d65d3c3150c06a
data/lib/textract.rb CHANGED
@@ -7,6 +7,10 @@ require 'readability'
7
7
 
8
8
  module Textract
9
9
  # attr_accessor :client
10
+ TAG_WHITELIST = %w[
11
+ div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong
12
+ figure
13
+ ]
10
14
 
11
15
  def self.get_text(url, selectors=nil, format="markdown")
12
16
  @client = Client.new(url, selectors, format)
@@ -43,7 +47,7 @@ module Textract
43
47
  article_el = doc
44
48
  end
45
49
  Readability::Document.new(article_el.to_s,
46
- tags: %w[div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong],
50
+ tags: TAG_WHITELIST,
47
51
  attributes: %w[src href],
48
52
  remove_empty_nodes: false,
49
53
  )
@@ -1,3 +1,3 @@
1
1
  module Textract
2
- VERSION = "0.0.7"
2
+ VERSION = "0.0.8"
3
3
  end
@@ -5,7 +5,7 @@ RSpec.configure do |c|
5
5
  c.filter_run :focus => true
6
6
  end
7
7
 
8
- describe Textract do
8
+ describe Textract, :focus do
9
9
  it "initializes with the get_text method" do
10
10
  url = "http://www.tedcruz.org/about/"
11
11
  article = Textract.get_text(url)
@@ -15,12 +15,12 @@ describe Textract do
15
15
  it "returns article text based on article tag" do
16
16
  url = "http://gawker.com/1694508525"
17
17
  article = Textract.get_text(url)
18
- expect(article.text[0..5]).to eq "Import"
19
- expect(article.md5).to eq "ae57104339fbd6455a91f8ebdc94b90c"
18
+ expect(article.text.include?("Import")).to eq true
19
+ expect(article.md5).to eq "c11a810a3e73f24aac78fd3e39e69f87"
20
20
  expect(article.author).to eq "Hamilton Nolan"
21
21
  end
22
22
 
23
- it "also includes images", :focus do
23
+ it "also includes images" do
24
24
  url = "http://gawker.com/1696731611"
25
25
  img = "http://i.kinja-img.com/gawker-media/image/upload/s--fWYFlEv6--/c_fit,fl_progressive,q_80,w_636/l3sjlg0ariqomd4ubtl6.jpg"
26
26
  article = Textract.get_text(url)
@@ -30,7 +30,7 @@ describe Textract do
30
30
  it "returns article text based on opengraph description" do
31
31
  url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
32
32
  article = Textract.get_text(url)
33
- expect(article.text[0..5]).to eq "Ted Cr"
33
+ expect(article.text.include?("Ted Cruz")).to eq true
34
34
  end
35
35
 
36
36
  it "can find a twitter profile given a selector" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.7
4
+ version: 0.0.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Pash