textract 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/textract/version.rb +1 -1
- data/lib/textract.rb +11 -7
- data/spec/fixtures/vcr_cassettes/bad_frisky.yml +1866 -0
- data/spec/fixtures/vcr_cassettes/cruz.yml +642 -0
- data/spec/fixtures/vcr_cassettes/hamno.yml +632 -0
- data/spec/fixtures/vcr_cassettes/imgs.yml +753 -0
- data/spec/fixtures/vcr_cassettes/json.yml +632 -0
- data/spec/fixtures/vcr_cassettes/og.yml +622 -0
- data/spec/fixtures/vcr_cassettes/selector.yml +684 -0
- data/spec/lib/textract_spec.rb +45 -29
- data/spec/spec_helper.rb +12 -0
- data/textract.gemspec +2 -0
- metadata +46 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b078301e33791b376b44f58325b18b84eaf9740b
|
4
|
+
data.tar.gz: fe260d19a7631f77ba97dc666ee95cc8f94f7304
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 481b7ccc2899cb1d78c1c333943fb0bfd1497ae30052c5e08478b3784a58d83b44134ca8fc9940c292fecd415221727bb39e9d7777da5b2414273fd0b72fdb35
|
7
|
+
data.tar.gz: f1992af9b124c953579e70f58b4dfe2dad797685cc025fba513b9e66c189fdfbfec45efa1f56f147dd4a2cd89b5b14c817e2789853a9e8f4a3a362f8336f6034
|
data/lib/textract/version.rb
CHANGED
data/lib/textract.rb
CHANGED
@@ -16,8 +16,12 @@ module Textract
|
|
16
16
|
@client = Client.new(url, selectors, format)
|
17
17
|
end
|
18
18
|
|
19
|
-
def self.get_og_tags(html)
|
20
|
-
|
19
|
+
def self.get_og_tags(html, url)
|
20
|
+
begin
|
21
|
+
OpenGraph.new(html)
|
22
|
+
rescue
|
23
|
+
OpenGraph.new(url)
|
24
|
+
end
|
21
25
|
end
|
22
26
|
|
23
27
|
def self.smart_extract(html, description, selectors)
|
@@ -47,10 +51,10 @@ module Textract
|
|
47
51
|
article_el = doc
|
48
52
|
end
|
49
53
|
Readability::Document.new(article_el.to_s,
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
+
tags: TAG_WHITELIST,
|
55
|
+
attributes: %w[src href],
|
56
|
+
remove_empty_nodes: false,
|
57
|
+
)
|
54
58
|
end
|
55
59
|
|
56
60
|
def self.get_page_title(html)
|
@@ -81,7 +85,7 @@ module Textract
|
|
81
85
|
agent = Mechanize.new
|
82
86
|
agent.user_agent_alias = 'Mac Safari'
|
83
87
|
@html = agent.get(url).content
|
84
|
-
@tags = Textract.get_og_tags(@html)
|
88
|
+
@tags = Textract.get_og_tags(@html, url)
|
85
89
|
|
86
90
|
@article = Textract.smart_extract(@html, @tags.description, selectors)
|
87
91
|
if @article.content.nil?
|