textract 0.0.11 → 0.0.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/textract.rb +3 -1
- data/lib/textract/version.rb +1 -1
- data/spec/fixtures/vcr_cassettes/robots.yml +65 -0
- data/spec/lib/textract_spec.rb +9 -0
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f62f2fb39f7c1fdcb45e476a7b2c9cf98dbd5ece
|
4
|
+
data.tar.gz: 501afc289766b50a687dd66ee1fc76ca92eaf8a6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8ac8ee85911b63bd77ac79c5808e405ab1bb7fa2cfa01a32bf3e9a915e774220fc9bf98818be74a91242ef282ac19e3d069239b788c85f58e026b050424c4c52
|
7
|
+
data.tar.gz: 62aa6256583fc55c9578307ad26d5b18d4a5d01401b28caa38e3dc48bd35230bb8981c9fa2d8a36082ae15670488444f7d28130da063f68ff25e75fa02cd1ee2
|
data/lib/textract.rb
CHANGED
@@ -89,7 +89,9 @@ module Textract
|
|
89
89
|
agent.user_agent_alias = 'Mac Safari'
|
90
90
|
@html = agent.get(url).content
|
91
91
|
@tags = Textract.get_og_tags(@html, url)
|
92
|
-
|
92
|
+
if @tags.url.match(/^(http|ftp)s?:\/\//)
|
93
|
+
@url = @tags.url
|
94
|
+
end
|
93
95
|
|
94
96
|
@article = Textract.smart_extract(@html, @tags.description, selectors)
|
95
97
|
if @article.content.nil?
|
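data/lib/textract/version.rb
CHANGED
(the +1 -1 hunk for this file is missing from this extract)
data/spec/fixtures/vcr_cassettes/robots.yml
ADDED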
@@ -0,0 +1,65 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://www.buzzfeed.com/robots.txt
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
Accept-Encoding:
|
11
|
+
- gzip,deflate,identity
|
12
|
+
Accept:
|
13
|
+
- "*/*"
|
14
|
+
User-Agent:
|
15
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML,
|
16
|
+
like Gecko) Version/5.1.1 Safari/534.51.22
|
17
|
+
Accept-Charset:
|
18
|
+
- ISO-8859-1,utf-8;q=0.7,*;q=0.7
|
19
|
+
Accept-Language:
|
20
|
+
- en-us,en;q=0.5
|
21
|
+
Host:
|
22
|
+
- www.buzzfeed.com
|
23
|
+
Connection:
|
24
|
+
- keep-alive
|
25
|
+
Keep-Alive:
|
26
|
+
- 300
|
27
|
+
response:
|
28
|
+
status:
|
29
|
+
code: 200
|
30
|
+
message: OK
|
31
|
+
headers:
|
32
|
+
Accept-Ranges:
|
33
|
+
- bytes
|
34
|
+
Content-Encoding:
|
35
|
+
- gzip
|
36
|
+
Content-Type:
|
37
|
+
- text/plain
|
38
|
+
Etag:
|
39
|
+
- '"65a75-2d4-51364dd1dcb66"'
|
40
|
+
Last-Modified:
|
41
|
+
- Fri, 10 Apr 2015 20:46:50 GMT
|
42
|
+
Server:
|
43
|
+
- Apache
|
44
|
+
Vary:
|
45
|
+
- Accept-Encoding
|
46
|
+
X-Buzzfeed:
|
47
|
+
- webdr02
|
48
|
+
Content-Length:
|
49
|
+
- '252'
|
50
|
+
Date:
|
51
|
+
- Thu, 21 May 2015 18:09:53 GMT
|
52
|
+
Connection:
|
53
|
+
- keep-alive
|
54
|
+
body:
|
55
|
+
encoding: ASCII-8BIT
|
56
|
+
string: !binary |-
|
57
|
+
H4sIAAAAAAAAA7WSwWrDMBBE7/oKHXoyxGpLT4bSQ/MHIWezlrayQLaEdl0l
|
58
|
+
+fo2hIBkTAuFXt/OsjPDHgnTDizO3MmJ5iGweE+Q/c6gh3Mnn54fpdg7Au9D
|
59
|
+
7qRq2tPkH0oyLJfLBtbAaEM6b4ymMDiPqkDNG73e6Ap6Z0cewml98APRqOpc
|
60
|
+
mBmJNxCVLPrFulkJcSxyN/8R50fn8g/WewtVYmJgpytkgMYhQKqruWcuEESn
|
61
|
+
No2tB/13TZ9OY2+Cpro140iH678UcllLDn5Jsf6nFyEOjnGC2MmROXZK5Zzb
|
62
|
+
u4FWh0nRTXAt+nfxjJn6ckOILyPFDovUAgAA
|
63
|
+
http_version:
|
64
|
+
recorded_at: Thu, 21 May 2015 18:09:54 GMT
|
65
|
+
recorded_with: VCR 2.9.3
|
data/spec/lib/textract_spec.rb
CHANGED
@@ -93,4 +93,13 @@ describe Textract do
|
|
93
93
|
end
|
94
94
|
end
|
95
95
|
|
96
|
+
it "handles robots.txt files" do
|
97
|
+
VCR.use_cassette('robots') do
|
98
|
+
url = "http://www.buzzfeed.com/robots.txt"
|
99
|
+
text = Textract.get_text(url)
|
100
|
+
expect(text.to_json).to be_a_kind_of String
|
101
|
+
expect(text.url).to eq url
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
96
105
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.11
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Pash
|
@@ -172,6 +172,7 @@ files:
|
|
172
172
|
- spec/fixtures/vcr_cassettes/imgs.yml
|
173
173
|
- spec/fixtures/vcr_cassettes/json.yml
|
174
174
|
- spec/fixtures/vcr_cassettes/og.yml
|
175
|
+
- spec/fixtures/vcr_cassettes/robots.yml
|
175
176
|
- spec/fixtures/vcr_cassettes/selector.yml
|
176
177
|
- spec/fixtures/vcr_cassettes/stackoverflow.yml
|
177
178
|
- spec/lib/textract_spec.rb
|
@@ -209,6 +210,7 @@ test_files:
|
|
209
210
|
- spec/fixtures/vcr_cassettes/imgs.yml
|
210
211
|
- spec/fixtures/vcr_cassettes/json.yml
|
211
212
|
- spec/fixtures/vcr_cassettes/og.yml
|
213
|
+
- spec/fixtures/vcr_cassettes/robots.yml
|
212
214
|
- spec/fixtures/vcr_cassettes/selector.yml
|
213
215
|
- spec/fixtures/vcr_cassettes/stackoverflow.yml
|
214
216
|
- spec/lib/textract_spec.rb
|