textract 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/textract.rb +5 -1
- data/lib/textract/version.rb +1 -1
- data/spec/lib/textract_spec.rb +5 -5
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f61cb21afc706941ebc3f09756dd153f7e6bd4ae
|
4
|
+
data.tar.gz: 78905e74279b209a7ccce1095b543050f749bb6f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9197c83b96dda8e79e3c88c08cb2aec64e3fa6b019722df9c92d7e84151d54390df9db18024f7cd5b2c00efeb4b314dafd8e48aa3bda89e31f97211ae122655d
|
7
|
+
data.tar.gz: c454baf54a7be0032972c4bb20e2ca87526942cf40f35a6bb83025588f299ef03c3dff0e25af278068c14211a10125f083267244bd7f9e28f9d65d3c3150c06a
|
data/lib/textract.rb
CHANGED
@@ -7,6 +7,10 @@ require 'readability'
|
|
7
7
|
|
8
8
|
module Textract
|
9
9
|
# attr_accessor :client
|
10
|
+
TAG_WHITELIST = %w[
|
11
|
+
div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong
|
12
|
+
figure
|
13
|
+
]
|
10
14
|
|
11
15
|
def self.get_text(url, selectors=nil, format="markdown")
|
12
16
|
@client = Client.new(url, selectors, format)
|
@@ -43,7 +47,7 @@ module Textract
|
|
43
47
|
article_el = doc
|
44
48
|
end
|
45
49
|
Readability::Document.new(article_el.to_s,
|
46
|
-
tags:
|
50
|
+
tags: TAG_WHITELIST,
|
47
51
|
attributes: %w[src href],
|
48
52
|
remove_empty_nodes: false,
|
49
53
|
)
|
data/lib/textract/version.rb
CHANGED
data/spec/lib/textract_spec.rb
CHANGED
@@ -5,7 +5,7 @@ RSpec.configure do |c|
|
|
5
5
|
c.filter_run :focus => true
|
6
6
|
end
|
7
7
|
|
8
|
-
describe Textract do
|
8
|
+
describe Textract, :focus do
|
9
9
|
it "initializes with the get_text method" do
|
10
10
|
url = "http://www.tedcruz.org/about/"
|
11
11
|
article = Textract.get_text(url)
|
@@ -15,12 +15,12 @@ describe Textract do
|
|
15
15
|
it "returns article text based on article tag" do
|
16
16
|
url = "http://gawker.com/1694508525"
|
17
17
|
article = Textract.get_text(url)
|
18
|
-
expect(article.text
|
19
|
-
expect(article.md5).to eq "
|
18
|
+
expect(article.text.include?("Import")).to eq true
|
19
|
+
expect(article.md5).to eq "c11a810a3e73f24aac78fd3e39e69f87"
|
20
20
|
expect(article.author).to eq "Hamilton Nolan"
|
21
21
|
end
|
22
22
|
|
23
|
-
it "also includes images"
|
23
|
+
it "also includes images" do
|
24
24
|
url = "http://gawker.com/1696731611"
|
25
25
|
img = "http://i.kinja-img.com/gawker-media/image/upload/s--fWYFlEv6--/c_fit,fl_progressive,q_80,w_636/l3sjlg0ariqomd4ubtl6.jpg"
|
26
26
|
article = Textract.get_text(url)
|
@@ -30,7 +30,7 @@ describe Textract do
|
|
30
30
|
it "returns article text based on opengraph description" do
|
31
31
|
url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
|
32
32
|
article = Textract.get_text(url)
|
33
|
-
expect(article.text
|
33
|
+
expect(article.text.include?("Ted Cruz")).to eq true
|
34
34
|
end
|
35
35
|
|
36
36
|
it "can find a twitter profile given a selector" do
|