textract 0.0.11 → 0.0.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/textract.rb +3 -1
- data/lib/textract/version.rb +1 -1
- data/spec/fixtures/vcr_cassettes/robots.yml +65 -0
- data/spec/lib/textract_spec.rb +9 -0
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f62f2fb39f7c1fdcb45e476a7b2c9cf98dbd5ece
|
4
|
+
data.tar.gz: 501afc289766b50a687dd66ee1fc76ca92eaf8a6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8ac8ee85911b63bd77ac79c5808e405ab1bb7fa2cfa01a32bf3e9a915e774220fc9bf98818be74a91242ef282ac19e3d069239b788c85f58e026b050424c4c52
|
7
|
+
data.tar.gz: 62aa6256583fc55c9578307ad26d5b18d4a5d01401b28caa38e3dc48bd35230bb8981c9fa2d8a36082ae15670488444f7d28130da063f68ff25e75fa02cd1ee2
|
data/lib/textract.rb
CHANGED
@@ -89,7 +89,9 @@ module Textract
|
|
89
89
|
agent.user_agent_alias = 'Mac Safari'
|
90
90
|
@html = agent.get(url).content
|
91
91
|
@tags = Textract.get_og_tags(@html, url)
|
92
|
-
|
92
|
+
if @tags.url.match(/^(http|ftp)s?:\/\//)
|
93
|
+
@url = @tags.url
|
94
|
+
end
|
93
95
|
|
94
96
|
@article = Textract.smart_extract(@html, @tags.description, selectors)
|
95
97
|
if @article.content.nil?
|
data/lib/textract/version.rb
CHANGED
data/spec/fixtures/vcr_cassettes/robots.yml
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://www.buzzfeed.com/robots.txt
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
Accept-Encoding:
|
11
|
+
- gzip,deflate,identity
|
12
|
+
Accept:
|
13
|
+
- "*/*"
|
14
|
+
User-Agent:
|
15
|
+
- Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML,
|
16
|
+
like Gecko) Version/5.1.1 Safari/534.51.22
|
17
|
+
Accept-Charset:
|
18
|
+
- ISO-8859-1,utf-8;q=0.7,*;q=0.7
|
19
|
+
Accept-Language:
|
20
|
+
- en-us,en;q=0.5
|
21
|
+
Host:
|
22
|
+
- www.buzzfeed.com
|
23
|
+
Connection:
|
24
|
+
- keep-alive
|
25
|
+
Keep-Alive:
|
26
|
+
- 300
|
27
|
+
response:
|
28
|
+
status:
|
29
|
+
code: 200
|
30
|
+
message: OK
|
31
|
+
headers:
|
32
|
+
Accept-Ranges:
|
33
|
+
- bytes
|
34
|
+
Content-Encoding:
|
35
|
+
- gzip
|
36
|
+
Content-Type:
|
37
|
+
- text/plain
|
38
|
+
Etag:
|
39
|
+
- '"65a75-2d4-51364dd1dcb66"'
|
40
|
+
Last-Modified:
|
41
|
+
- Fri, 10 Apr 2015 20:46:50 GMT
|
42
|
+
Server:
|
43
|
+
- Apache
|
44
|
+
Vary:
|
45
|
+
- Accept-Encoding
|
46
|
+
X-Buzzfeed:
|
47
|
+
- webdr02
|
48
|
+
Content-Length:
|
49
|
+
- '252'
|
50
|
+
Date:
|
51
|
+
- Thu, 21 May 2015 18:09:53 GMT
|
52
|
+
Connection:
|
53
|
+
- keep-alive
|
54
|
+
body:
|
55
|
+
encoding: ASCII-8BIT
|
56
|
+
string: !binary |-
|
57
|
+
H4sIAAAAAAAAA7WSwWrDMBBE7/oKHXoyxGpLT4bSQ/MHIWezlrayQLaEdl0l
|
58
|
+
+fo2hIBkTAuFXt/OsjPDHgnTDizO3MmJ5iGweE+Q/c6gh3Mnn54fpdg7Au9D
|
59
|
+
7qRq2tPkH0oyLJfLBtbAaEM6b4ymMDiPqkDNG73e6Ap6Z0cewml98APRqOpc
|
60
|
+
mBmJNxCVLPrFulkJcSxyN/8R50fn8g/WewtVYmJgpytkgMYhQKqruWcuEESn
|
61
|
+
No2tB/13TZ9OY2+Cpro140iH678UcllLDn5Jsf6nFyEOjnGC2MmROXZK5Zzb
|
62
|
+
u4FWh0nRTXAt+nfxjJn6ckOILyPFDovUAgAA
|
63
|
+
http_version:
|
64
|
+
recorded_at: Thu, 21 May 2015 18:09:54 GMT
|
65
|
+
recorded_with: VCR 2.9.3
|
data/spec/lib/textract_spec.rb
CHANGED
@@ -93,4 +93,13 @@ describe Textract do
|
|
93
93
|
end
|
94
94
|
end
|
95
95
|
|
96
|
+
it "handles robots.txt files" do
|
97
|
+
VCR.use_cassette('robots') do
|
98
|
+
url = "http://www.buzzfeed.com/robots.txt"
|
99
|
+
text = Textract.get_text(url)
|
100
|
+
expect(text.to_json).to be_a_kind_of String
|
101
|
+
expect(text.url).to eq url
|
102
|
+
end
|
103
|
+
end
|
104
|
+
|
96
105
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.11
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Pash
|
@@ -172,6 +172,7 @@ files:
|
|
172
172
|
- spec/fixtures/vcr_cassettes/imgs.yml
|
173
173
|
- spec/fixtures/vcr_cassettes/json.yml
|
174
174
|
- spec/fixtures/vcr_cassettes/og.yml
|
175
|
+
- spec/fixtures/vcr_cassettes/robots.yml
|
175
176
|
- spec/fixtures/vcr_cassettes/selector.yml
|
176
177
|
- spec/fixtures/vcr_cassettes/stackoverflow.yml
|
177
178
|
- spec/lib/textract_spec.rb
|
@@ -209,6 +210,7 @@ test_files:
|
|
209
210
|
- spec/fixtures/vcr_cassettes/imgs.yml
|
210
211
|
- spec/fixtures/vcr_cassettes/json.yml
|
211
212
|
- spec/fixtures/vcr_cassettes/og.yml
|
213
|
+
- spec/fixtures/vcr_cassettes/robots.yml
|
212
214
|
- spec/fixtures/vcr_cassettes/selector.yml
|
213
215
|
- spec/fixtures/vcr_cassettes/stackoverflow.yml
|
214
216
|
- spec/lib/textract_spec.rb
|