textract 0.0.11 → 0.0.12

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 08b1fc812dcefcee0078061f54823d9f026b5359
4
- data.tar.gz: 066edc9db9c5b3f3806dd04d7b5aca27b876b76f
3
+ metadata.gz: f62f2fb39f7c1fdcb45e476a7b2c9cf98dbd5ece
4
+ data.tar.gz: 501afc289766b50a687dd66ee1fc76ca92eaf8a6
5
5
  SHA512:
6
- metadata.gz: 7c53f59822db82c833fda369f4d1f976b4bfe386a78453145591eb1fa58b279a805ad53ac6077e4c67f2963596e83324e6bcbfcd65054da3fe1cbce95301c979
7
- data.tar.gz: 5f3ea744f2de5a51ae5e172a3d943d6c747d3fbf9eac262ea9054265b7a767b56dd18e779d4250c12ad9d724ead604d0dad962c4cd27789ea0d427404d5005b0
6
+ metadata.gz: 8ac8ee85911b63bd77ac79c5808e405ab1bb7fa2cfa01a32bf3e9a915e774220fc9bf98818be74a91242ef282ac19e3d069239b788c85f58e026b050424c4c52
7
+ data.tar.gz: 62aa6256583fc55c9578307ad26d5b18d4a5d01401b28caa38e3dc48bd35230bb8981c9fa2d8a36082ae15670488444f7d28130da063f68ff25e75fa02cd1ee2
@@ -89,7 +89,9 @@ module Textract
89
89
  agent.user_agent_alias = 'Mac Safari'
90
90
  @html = agent.get(url).content
91
91
  @tags = Textract.get_og_tags(@html, url)
92
- @url = @tags.url || @url
92
+ if @tags.url.match(/^(http|ftp)s?:\/\//)
93
+ @url = @tags.url
94
+ end
93
95
 
94
96
  @article = Textract.smart_extract(@html, @tags.description, selectors)
95
97
  if @article.content.nil?
@@ -1,3 +1,3 @@
1
1
  module Textract
2
- VERSION = "0.0.11"
2
+ VERSION = "0.0.12"
3
3
  end
@@ -0,0 +1,65 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://www.buzzfeed.com/robots.txt
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ Accept-Encoding:
11
+ - gzip,deflate,identity
12
+ Accept:
13
+ - "*/*"
14
+ User-Agent:
15
+ - Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML,
16
+ like Gecko) Version/5.1.1 Safari/534.51.22
17
+ Accept-Charset:
18
+ - ISO-8859-1,utf-8;q=0.7,*;q=0.7
19
+ Accept-Language:
20
+ - en-us,en;q=0.5
21
+ Host:
22
+ - www.buzzfeed.com
23
+ Connection:
24
+ - keep-alive
25
+ Keep-Alive:
26
+ - 300
27
+ response:
28
+ status:
29
+ code: 200
30
+ message: OK
31
+ headers:
32
+ Accept-Ranges:
33
+ - bytes
34
+ Content-Encoding:
35
+ - gzip
36
+ Content-Type:
37
+ - text/plain
38
+ Etag:
39
+ - '"65a75-2d4-51364dd1dcb66"'
40
+ Last-Modified:
41
+ - Fri, 10 Apr 2015 20:46:50 GMT
42
+ Server:
43
+ - Apache
44
+ Vary:
45
+ - Accept-Encoding
46
+ X-Buzzfeed:
47
+ - webdr02
48
+ Content-Length:
49
+ - '252'
50
+ Date:
51
+ - Thu, 21 May 2015 18:09:53 GMT
52
+ Connection:
53
+ - keep-alive
54
+ body:
55
+ encoding: ASCII-8BIT
56
+ string: !binary |-
57
+ H4sIAAAAAAAAA7WSwWrDMBBE7/oKHXoyxGpLT4bSQ/MHIWezlrayQLaEdl0l
58
+ +fo2hIBkTAuFXt/OsjPDHgnTDizO3MmJ5iGweE+Q/c6gh3Mnn54fpdg7Au9D
59
+ 7qRq2tPkH0oyLJfLBtbAaEM6b4ymMDiPqkDNG73e6Ap6Z0cewml98APRqOpc
60
+ mBmJNxCVLPrFulkJcSxyN/8R50fn8g/WewtVYmJgpytkgMYhQKqruWcuEESn
61
+ No2tB/13TZ9OY2+Cpro140iH678UcllLDn5Jsf6nFyEOjnGC2MmROXZK5Zzb
62
+ u4FWh0nRTXAt+nfxjJn6ckOILyPFDovUAgAA
63
+ http_version:
64
+ recorded_at: Thu, 21 May 2015 18:09:54 GMT
65
+ recorded_with: VCR 2.9.3
@@ -93,4 +93,13 @@ describe Textract do
93
93
  end
94
94
  end
95
95
 
96
+ it "handles robots.txt files" do
97
+ VCR.use_cassette('robots') do
98
+ url = "http://www.buzzfeed.com/robots.txt"
99
+ text = Textract.get_text(url)
100
+ expect(text.to_json).to be_a_kind_of String
101
+ expect(text.url).to eq url
102
+ end
103
+ end
104
+
96
105
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.11
4
+ version: 0.0.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Pash
@@ -172,6 +172,7 @@ files:
172
172
  - spec/fixtures/vcr_cassettes/imgs.yml
173
173
  - spec/fixtures/vcr_cassettes/json.yml
174
174
  - spec/fixtures/vcr_cassettes/og.yml
175
+ - spec/fixtures/vcr_cassettes/robots.yml
175
176
  - spec/fixtures/vcr_cassettes/selector.yml
176
177
  - spec/fixtures/vcr_cassettes/stackoverflow.yml
177
178
  - spec/lib/textract_spec.rb
@@ -209,6 +210,7 @@ test_files:
209
210
  - spec/fixtures/vcr_cassettes/imgs.yml
210
211
  - spec/fixtures/vcr_cassettes/json.yml
211
212
  - spec/fixtures/vcr_cassettes/og.yml
213
+ - spec/fixtures/vcr_cassettes/robots.yml
212
214
  - spec/fixtures/vcr_cassettes/selector.yml
213
215
  - spec/fixtures/vcr_cassettes/stackoverflow.yml
214
216
  - spec/lib/textract_spec.rb