textract 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/textract/version.rb +1 -1
- data/lib/textract.rb +11 -7
- data/spec/fixtures/vcr_cassettes/bad_frisky.yml +1866 -0
- data/spec/fixtures/vcr_cassettes/cruz.yml +642 -0
- data/spec/fixtures/vcr_cassettes/hamno.yml +632 -0
- data/spec/fixtures/vcr_cassettes/imgs.yml +753 -0
- data/spec/fixtures/vcr_cassettes/json.yml +632 -0
- data/spec/fixtures/vcr_cassettes/og.yml +622 -0
- data/spec/fixtures/vcr_cassettes/selector.yml +684 -0
- data/spec/lib/textract_spec.rb +45 -29
- data/spec/spec_helper.rb +12 -0
- data/textract.gemspec +2 -0
- metadata +46 -2
data/spec/lib/textract_spec.rb
CHANGED
@@ -1,43 +1,49 @@
|
|
1
|
-
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'textract'
|
2
3
|
|
3
|
-
|
4
|
-
# filter_run is short-form alias for filter_run_including
|
5
|
-
c.filter_run :focus => true
|
6
|
-
end
|
7
|
-
|
8
|
-
describe Textract, :focus do
|
4
|
+
describe Textract do
|
9
5
|
it "initializes with the get_text method" do
|
10
|
-
|
11
|
-
|
12
|
-
|
6
|
+
VCR.use_cassette("cruz") do
|
7
|
+
url = "http://www.tedcruz.org/about/"
|
8
|
+
article = Textract.get_text(url)
|
9
|
+
expect(article).to be_a_kind_of Textract::Client
|
10
|
+
end
|
13
11
|
end
|
14
12
|
|
15
13
|
it "returns article text based on article tag" do
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
14
|
+
VCR.use_cassette("hamno") do
|
15
|
+
url = "http://gawker.com/1694508525"
|
16
|
+
article = Textract.get_text(url)
|
17
|
+
expect(article.text.include?("Import")).to eq true
|
18
|
+
expect(article.md5).to eq "c11a810a3e73f24aac78fd3e39e69f87"
|
19
|
+
expect(article.author).to eq "Hamilton Nolan"
|
20
|
+
end
|
21
21
|
end
|
22
22
|
|
23
23
|
it "also includes images" do
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
24
|
+
VCR.use_cassette('imgs') do
|
25
|
+
url = "http://gawker.com/1696731611"
|
26
|
+
img = "http://i.kinja-img.com/gawker-media/image/upload/s--fWYFlEv6--/c_fit,fl_progressive,q_80,w_636/l3sjlg0ariqomd4ubtl6.jpg"
|
27
|
+
article = Textract.get_text(url)
|
28
|
+
expect(article.text.include?(img)).to be true
|
29
|
+
end
|
28
30
|
end
|
29
31
|
|
30
32
|
it "returns article text based on opengraph description" do
|
31
|
-
|
32
|
-
|
33
|
-
|
33
|
+
VCR.use_cassette('og') do
|
34
|
+
url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
|
35
|
+
article = Textract.get_text(url)
|
36
|
+
expect(article.text.include?("Ted Cruz")).to eq true
|
37
|
+
end
|
34
38
|
end
|
35
39
|
|
36
40
|
it "can find a twitter profile given a selector" do
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
+
VCR.use_cassette('selector') do
|
42
|
+
url = "https://twitter.com/lifehacker"
|
43
|
+
article = Textract.get_text(url, 'p.ProfileHeaderCard-bio.u-dir')
|
44
|
+
expect(article.text.strip).to eq "Don't live to geek; geek to live."
|
45
|
+
expect(article.title).to eq "Lifehacker (@lifehacker) | Twitter"
|
46
|
+
end
|
41
47
|
end
|
42
48
|
|
43
49
|
it "gets the page title from the title tag" do
|
@@ -51,9 +57,19 @@ describe Textract, :focus do
|
|
51
57
|
end
|
52
58
|
|
53
59
|
it "converts itself to json" do
|
54
|
-
|
55
|
-
|
56
|
-
|
60
|
+
VCR.use_cassette('json') do
|
61
|
+
url = "http://gawker.com/1694508525"
|
62
|
+
article = Textract.get_text(url)
|
63
|
+
expect(article.to_json).to be_a_kind_of String
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
67
|
+
it "handles problem urls" do
|
68
|
+
VCR.use_cassette('bad frisky') do
|
69
|
+
url = "http://www.thefrisky.com/2015-04-22/10-things-i-was-irrationally-jealous-of-in-high-school-and-admittedly-still-am/"
|
70
|
+
article = Textract.get_text(url)
|
71
|
+
expect(article.to_json).to be_a_kind_of String
|
72
|
+
end
|
57
73
|
end
|
58
74
|
|
59
75
|
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
RSpec.configure do |c|
|
2
|
+
c.filter_run_including :focus => true
|
3
|
+
c.run_all_when_everything_filtered = true
|
4
|
+
end
|
5
|
+
|
6
|
+
require 'vcr'
|
7
|
+
VCR.configure do |config|
|
8
|
+
config.cassette_library_dir = "spec/fixtures/vcr_cassettes"
|
9
|
+
config.hook_into :webmock
|
10
|
+
config.allow_http_connections_when_no_cassette = true
|
11
|
+
end
|
12
|
+
|
data/textract.gemspec
CHANGED
@@ -27,4 +27,6 @@ Gem::Specification.new do |spec|
|
|
27
27
|
spec.add_development_dependency "rake", "~> 10.0"
|
28
28
|
spec.add_development_dependency "rspec"
|
29
29
|
spec.add_development_dependency "guard-rspec"
|
30
|
+
spec.add_development_dependency "vcr"
|
31
|
+
spec.add_development_dependency "webmock"
|
30
32
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textract
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.8
|
4
|
+
version: 0.0.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Pash
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: opengraph_parser
|
@@ -122,6 +122,34 @@ dependencies:
|
|
122
122
|
- - ">="
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
126
|
+
name: vcr
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
128
|
+
requirements:
|
129
|
+
- - ">="
|
130
|
+
- !ruby/object:Gem::Version
|
131
|
+
version: '0'
|
132
|
+
type: :development
|
133
|
+
prerelease: false
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: webmock
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - ">="
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: '0'
|
146
|
+
type: :development
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - ">="
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: '0'
|
125
153
|
description: Extracts article text from a URL
|
126
154
|
email:
|
127
155
|
- adam.pash@gmail.com
|
@@ -137,7 +165,15 @@ files:
|
|
137
165
|
- Rakefile
|
138
166
|
- lib/textract.rb
|
139
167
|
- lib/textract/version.rb
|
168
|
+
- spec/fixtures/vcr_cassettes/bad_frisky.yml
|
169
|
+
- spec/fixtures/vcr_cassettes/cruz.yml
|
170
|
+
- spec/fixtures/vcr_cassettes/hamno.yml
|
171
|
+
- spec/fixtures/vcr_cassettes/imgs.yml
|
172
|
+
- spec/fixtures/vcr_cassettes/json.yml
|
173
|
+
- spec/fixtures/vcr_cassettes/og.yml
|
174
|
+
- spec/fixtures/vcr_cassettes/selector.yml
|
140
175
|
- spec/lib/textract_spec.rb
|
176
|
+
- spec/spec_helper.rb
|
141
177
|
- textract.gemspec
|
142
178
|
homepage: ''
|
143
179
|
licenses:
|
@@ -164,4 +200,12 @@ signing_key:
|
|
164
200
|
specification_version: 4
|
165
201
|
summary: Extracts article text from a URL
|
166
202
|
test_files:
|
203
|
+
- spec/fixtures/vcr_cassettes/bad_frisky.yml
|
204
|
+
- spec/fixtures/vcr_cassettes/cruz.yml
|
205
|
+
- spec/fixtures/vcr_cassettes/hamno.yml
|
206
|
+
- spec/fixtures/vcr_cassettes/imgs.yml
|
207
|
+
- spec/fixtures/vcr_cassettes/json.yml
|
208
|
+
- spec/fixtures/vcr_cassettes/og.yml
|
209
|
+
- spec/fixtures/vcr_cassettes/selector.yml
|
167
210
|
- spec/lib/textract_spec.rb
|
211
|
+
- spec/spec_helper.rb
|