textract 0.0.6.3 → 0.0.7
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
- checksums.yaml +4 -4
- data/lib/textract/version.rb +1 -1
- data/lib/textract.rb +2 -2
- data/spec/lib/textract_spec.rb +13 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a0d8ffc9fc175116ed6a851cc12a992b4c2eb95b
|
4
|
+
data.tar.gz: d6093955a7cf136b0430246eebd62773f0c36d6f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2ed2d11aa3988cbc81b31b7da07e9d74d09a7f1b057ca450d7fa78d1bd919a4f4dc83caf691a81e9bdb16fd58c848b723a1a27c77f0be3e503d8bce65bc8b5ad
|
7
|
+
data.tar.gz: 377327580500633d4f21ead9adfc610ce7fba65b752525ca5c6879e615e412732b228fd489789f5166f8c1971e5b8ec75207aa722fae96467af41becf67acfda
|
data/lib/textract/version.rb
CHANGED
data/lib/textract.rb
CHANGED
@@ -42,10 +42,10 @@ module Textract
|
|
42
42
|
else
|
43
43
|
article_el = doc
|
44
44
|
end
|
45
|
-
|
45
|
+
Readability::Document.new(article_el.to_s,
|
46
46
|
tags: %w[div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong],
|
47
47
|
attributes: %w[src href],
|
48
|
-
remove_empty_nodes:
|
48
|
+
remove_empty_nodes: false,
|
49
49
|
)
|
50
50
|
end
|
51
51
|
|
data/spec/lib/textract_spec.rb
CHANGED
@@ -1,4 +1,10 @@
|
|
1
1
|
require_relative '../../lib/textract'
|
2
|
+
|
3
|
+
RSpec.configure do |c|
|
4
|
+
# filter_run is short-form alias for filter_run_including
|
5
|
+
c.filter_run :focus => true
|
6
|
+
end
|
7
|
+
|
2
8
|
describe Textract do
|
3
9
|
it "initializes with the get_text method" do
|
4
10
|
url = "http://www.tedcruz.org/about/"
|
@@ -14,6 +20,13 @@ describe Textract do
|
|
14
20
|
expect(article.author).to eq "Hamilton Nolan"
|
15
21
|
end
|
16
22
|
|
23
|
+
it "also includes images", :focus do
|
24
|
+
url = "http://gawker.com/1696731611"
|
25
|
+
img = "http://i.kinja-img.com/gawker-media/image/upload/s--fWYFlEv6--/c_fit,fl_progressive,q_80,w_636/l3sjlg0ariqomd4ubtl6.jpg"
|
26
|
+
article = Textract.get_text(url)
|
27
|
+
expect(article.text.include?(img)).to be true
|
28
|
+
end
|
29
|
+
|
17
30
|
it "returns article text based on opengraph description" do
|
18
31
|
url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
|
19
32
|
article = Textract.get_text(url)
|