textract 0.0.6.3 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/textract/version.rb +1 -1
- data/lib/textract.rb +2 -2
- data/spec/lib/textract_spec.rb +13 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a0d8ffc9fc175116ed6a851cc12a992b4c2eb95b
|
4
|
+
data.tar.gz: d6093955a7cf136b0430246eebd62773f0c36d6f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2ed2d11aa3988cbc81b31b7da07e9d74d09a7f1b057ca450d7fa78d1bd919a4f4dc83caf691a81e9bdb16fd58c848b723a1a27c77f0be3e503d8bce65bc8b5ad
|
7
|
+
data.tar.gz: 377327580500633d4f21ead9adfc610ce7fba65b752525ca5c6879e615e412732b228fd489789f5166f8c1971e5b8ec75207aa722fae96467af41becf67acfda
|
data/lib/textract/version.rb
CHANGED
data/lib/textract.rb
CHANGED
@@ -42,10 +42,10 @@ module Textract
|
|
42
42
|
else
|
43
43
|
article_el = doc
|
44
44
|
end
|
45
|
-
|
45
|
+
Readability::Document.new(article_el.to_s,
|
46
46
|
tags: %w[div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong],
|
47
47
|
attributes: %w[src href],
|
48
|
-
remove_empty_nodes:
|
48
|
+
remove_empty_nodes: false,
|
49
49
|
)
|
50
50
|
end
|
51
51
|
|
data/spec/lib/textract_spec.rb
CHANGED
@@ -1,4 +1,10 @@
|
|
1
1
|
require_relative '../../lib/textract'
|
2
|
+
|
3
|
+
RSpec.configure do |c|
|
4
|
+
# filter_run is short-form alias for filter_run_including
|
5
|
+
c.filter_run :focus => true
|
6
|
+
end
|
7
|
+
|
2
8
|
describe Textract do
|
3
9
|
it "initializes with the get_text method" do
|
4
10
|
url = "http://www.tedcruz.org/about/"
|
@@ -14,6 +20,13 @@ describe Textract do
|
|
14
20
|
expect(article.author).to eq "Hamilton Nolan"
|
15
21
|
end
|
16
22
|
|
23
|
+
it "also includes images", :focus do
|
24
|
+
url = "http://gawker.com/1696731611"
|
25
|
+
img = "http://i.kinja-img.com/gawker-media/image/upload/s--fWYFlEv6--/c_fit,fl_progressive,q_80,w_636/l3sjlg0ariqomd4ubtl6.jpg"
|
26
|
+
article = Textract.get_text(url)
|
27
|
+
expect(article.text.include?(img)).to be true
|
28
|
+
end
|
29
|
+
|
17
30
|
it "returns article text based on opengraph description" do
|
18
31
|
url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
|
19
32
|
article = Textract.get_text(url)
|