textract 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/textract.rb +5 -1
- data/lib/textract/version.rb +1 -1
- data/spec/lib/textract_spec.rb +5 -5
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f61cb21afc706941ebc3f09756dd153f7e6bd4ae
|
4
|
+
data.tar.gz: 78905e74279b209a7ccce1095b543050f749bb6f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9197c83b96dda8e79e3c88c08cb2aec64e3fa6b019722df9c92d7e84151d54390df9db18024f7cd5b2c00efeb4b314dafd8e48aa3bda89e31f97211ae122655d
|
7
|
+
data.tar.gz: c454baf54a7be0032972c4bb20e2ca87526942cf40f35a6bb83025588f299ef03c3dff0e25af278068c14211a10125f083267244bd7f9e28f9d65d3c3150c06a
|
data/lib/textract.rb
CHANGED
@@ -7,6 +7,10 @@ require 'readability'
|
|
7
7
|
|
8
8
|
module Textract
|
9
9
|
# attr_accessor :client
|
10
|
+
TAG_WHITELIST = %w[
|
11
|
+
div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong
|
12
|
+
figure
|
13
|
+
]
|
10
14
|
|
11
15
|
def self.get_text(url, selectors=nil, format="markdown")
|
12
16
|
@client = Client.new(url, selectors, format)
|
@@ -43,7 +47,7 @@ module Textract
|
|
43
47
|
article_el = doc
|
44
48
|
end
|
45
49
|
Readability::Document.new(article_el.to_s,
|
46
|
-
tags:
|
50
|
+
tags: TAG_WHITELIST,
|
47
51
|
attributes: %w[src href],
|
48
52
|
remove_empty_nodes: false,
|
49
53
|
)
|
data/lib/textract/version.rb
CHANGED
data/spec/lib/textract_spec.rb
CHANGED
@@ -5,7 +5,7 @@ RSpec.configure do |c|
|
|
5
5
|
c.filter_run :focus => true
|
6
6
|
end
|
7
7
|
|
8
|
-
describe Textract do
|
8
|
+
describe Textract, :focus do
|
9
9
|
it "initializes with the get_text method" do
|
10
10
|
url = "http://www.tedcruz.org/about/"
|
11
11
|
article = Textract.get_text(url)
|
@@ -15,12 +15,12 @@ describe Textract do
|
|
15
15
|
it "returns article text based on article tag" do
|
16
16
|
url = "http://gawker.com/1694508525"
|
17
17
|
article = Textract.get_text(url)
|
18
|
-
expect(article.text
|
19
|
-
expect(article.md5).to eq "
|
18
|
+
expect(article.text.include?("Import")).to eq true
|
19
|
+
expect(article.md5).to eq "c11a810a3e73f24aac78fd3e39e69f87"
|
20
20
|
expect(article.author).to eq "Hamilton Nolan"
|
21
21
|
end
|
22
22
|
|
23
|
-
it "also includes images"
|
23
|
+
it "also includes images" do
|
24
24
|
url = "http://gawker.com/1696731611"
|
25
25
|
img = "http://i.kinja-img.com/gawker-media/image/upload/s--fWYFlEv6--/c_fit,fl_progressive,q_80,w_636/l3sjlg0ariqomd4ubtl6.jpg"
|
26
26
|
article = Textract.get_text(url)
|
@@ -30,7 +30,7 @@ describe Textract do
|
|
30
30
|
it "returns article text based on opengraph description" do
|
31
31
|
url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
|
32
32
|
article = Textract.get_text(url)
|
33
|
-
expect(article.text
|
33
|
+
expect(article.text.include?("Ted Cruz")).to eq true
|
34
34
|
end
|
35
35
|
|
36
36
|
it "can find a twitter profile given a selector" do
|