textract 0.0.11 → 0.0.12

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 08b1fc812dcefcee0078061f54823d9f026b5359
4
- data.tar.gz: 066edc9db9c5b3f3806dd04d7b5aca27b876b76f
3
+ metadata.gz: f62f2fb39f7c1fdcb45e476a7b2c9cf98dbd5ece
4
+ data.tar.gz: 501afc289766b50a687dd66ee1fc76ca92eaf8a6
5
5
  SHA512:
6
- metadata.gz: 7c53f59822db82c833fda369f4d1f976b4bfe386a78453145591eb1fa58b279a805ad53ac6077e4c67f2963596e83324e6bcbfcd65054da3fe1cbce95301c979
7
- data.tar.gz: 5f3ea744f2de5a51ae5e172a3d943d6c747d3fbf9eac262ea9054265b7a767b56dd18e779d4250c12ad9d724ead604d0dad962c4cd27789ea0d427404d5005b0
6
+ metadata.gz: 8ac8ee85911b63bd77ac79c5808e405ab1bb7fa2cfa01a32bf3e9a915e774220fc9bf98818be74a91242ef282ac19e3d069239b788c85f58e026b050424c4c52
7
+ data.tar.gz: 62aa6256583fc55c9578307ad26d5b18d4a5d01401b28caa38e3dc48bd35230bb8981c9fa2d8a36082ae15670488444f7d28130da063f68ff25e75fa02cd1ee2
@@ -89,7 +89,9 @@ module Textract
89
89
  agent.user_agent_alias = 'Mac Safari'
90
90
  @html = agent.get(url).content
91
91
  @tags = Textract.get_og_tags(@html, url)
92
- @url = @tags.url || @url
92
+ if @tags.url.match(/^(http|ftp)s?:\/\//)
93
+ @url = @tags.url
94
+ end
93
95
 
94
96
  @article = Textract.smart_extract(@html, @tags.description, selectors)
95
97
  if @article.content.nil?
@@ -1,3 +1,3 @@
1
1
  module Textract
2
- VERSION = "0.0.11"
2
+ VERSION = "0.0.12"
3
3
  end
@@ -0,0 +1,65 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://www.buzzfeed.com/robots.txt
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ Accept-Encoding:
11
+ - gzip,deflate,identity
12
+ Accept:
13
+ - "*/*"
14
+ User-Agent:
15
+ - Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML,
16
+ like Gecko) Version/5.1.1 Safari/534.51.22
17
+ Accept-Charset:
18
+ - ISO-8859-1,utf-8;q=0.7,*;q=0.7
19
+ Accept-Language:
20
+ - en-us,en;q=0.5
21
+ Host:
22
+ - www.buzzfeed.com
23
+ Connection:
24
+ - keep-alive
25
+ Keep-Alive:
26
+ - 300
27
+ response:
28
+ status:
29
+ code: 200
30
+ message: OK
31
+ headers:
32
+ Accept-Ranges:
33
+ - bytes
34
+ Content-Encoding:
35
+ - gzip
36
+ Content-Type:
37
+ - text/plain
38
+ Etag:
39
+ - '"65a75-2d4-51364dd1dcb66"'
40
+ Last-Modified:
41
+ - Fri, 10 Apr 2015 20:46:50 GMT
42
+ Server:
43
+ - Apache
44
+ Vary:
45
+ - Accept-Encoding
46
+ X-Buzzfeed:
47
+ - webdr02
48
+ Content-Length:
49
+ - '252'
50
+ Date:
51
+ - Thu, 21 May 2015 18:09:53 GMT
52
+ Connection:
53
+ - keep-alive
54
+ body:
55
+ encoding: ASCII-8BIT
56
+ string: !binary |-
57
+ H4sIAAAAAAAAA7WSwWrDMBBE7/oKHXoyxGpLT4bSQ/MHIWezlrayQLaEdl0l
58
+ +fo2hIBkTAuFXt/OsjPDHgnTDizO3MmJ5iGweE+Q/c6gh3Mnn54fpdg7Au9D
59
+ 7qRq2tPkH0oyLJfLBtbAaEM6b4ymMDiPqkDNG73e6Ap6Z0cewml98APRqOpc
60
+ mBmJNxCVLPrFulkJcSxyN/8R50fn8g/WewtVYmJgpytkgMYhQKqruWcuEESn
61
+ No2tB/13TZ9OY2+Cpro140iH678UcllLDn5Jsf6nFyEOjnGC2MmROXZK5Zzb
62
+ u4FWh0nRTXAt+nfxjJn6ckOILyPFDovUAgAA
63
+ http_version:
64
+ recorded_at: Thu, 21 May 2015 18:09:54 GMT
65
+ recorded_with: VCR 2.9.3
@@ -93,4 +93,13 @@ describe Textract do
93
93
  end
94
94
  end
95
95
 
96
+ it "handles robots.txt files" do
97
+ VCR.use_cassette('robots') do
98
+ url = "http://www.buzzfeed.com/robots.txt"
99
+ text = Textract.get_text(url)
100
+ expect(text.to_json).to be_a_kind_of String
101
+ expect(text.url).to eq url
102
+ end
103
+ end
104
+
96
105
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.11
4
+ version: 0.0.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Pash
@@ -172,6 +172,7 @@ files:
172
172
  - spec/fixtures/vcr_cassettes/imgs.yml
173
173
  - spec/fixtures/vcr_cassettes/json.yml
174
174
  - spec/fixtures/vcr_cassettes/og.yml
175
+ - spec/fixtures/vcr_cassettes/robots.yml
175
176
  - spec/fixtures/vcr_cassettes/selector.yml
176
177
  - spec/fixtures/vcr_cassettes/stackoverflow.yml
177
178
  - spec/lib/textract_spec.rb
@@ -209,6 +210,7 @@ test_files:
209
210
  - spec/fixtures/vcr_cassettes/imgs.yml
210
211
  - spec/fixtures/vcr_cassettes/json.yml
211
212
  - spec/fixtures/vcr_cassettes/og.yml
213
+ - spec/fixtures/vcr_cassettes/robots.yml
212
214
  - spec/fixtures/vcr_cassettes/selector.yml
213
215
  - spec/fixtures/vcr_cassettes/stackoverflow.yml
214
216
  - spec/lib/textract_spec.rb