textract 0.0.8 → 0.0.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f61cb21afc706941ebc3f09756dd153f7e6bd4ae
4
- data.tar.gz: 78905e74279b209a7ccce1095b543050f749bb6f
3
+ metadata.gz: b078301e33791b376b44f58325b18b84eaf9740b
4
+ data.tar.gz: fe260d19a7631f77ba97dc666ee95cc8f94f7304
5
5
  SHA512:
6
- metadata.gz: 9197c83b96dda8e79e3c88c08cb2aec64e3fa6b019722df9c92d7e84151d54390df9db18024f7cd5b2c00efeb4b314dafd8e48aa3bda89e31f97211ae122655d
7
- data.tar.gz: c454baf54a7be0032972c4bb20e2ca87526942cf40f35a6bb83025588f299ef03c3dff0e25af278068c14211a10125f083267244bd7f9e28f9d65d3c3150c06a
6
+ metadata.gz: 481b7ccc2899cb1d78c1c333943fb0bfd1497ae30052c5e08478b3784a58d83b44134ca8fc9940c292fecd415221727bb39e9d7777da5b2414273fd0b72fdb35
7
+ data.tar.gz: f1992af9b124c953579e70f58b4dfe2dad797685cc025fba513b9e66c189fdfbfec45efa1f56f147dd4a2cd89b5b14c817e2789853a9e8f4a3a362f8336f6034
data/lib/textract/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Textract
2
- VERSION = "0.0.8"
2
+ VERSION = "0.0.9"
3
3
  end
data/lib/textract.rb CHANGED
@@ -16,8 +16,12 @@ module Textract
16
16
  @client = Client.new(url, selectors, format)
17
17
  end
18
18
 
19
- def self.get_og_tags(html)
20
- OpenGraph.new(html)
19
+ def self.get_og_tags(html, url)
20
+ begin
21
+ OpenGraph.new(html)
22
+ rescue
23
+ OpenGraph.new(url)
24
+ end
21
25
  end
22
26
 
23
27
  def self.smart_extract(html, description, selectors)
@@ -47,10 +51,10 @@ module Textract
47
51
  article_el = doc
48
52
  end
49
53
  Readability::Document.new(article_el.to_s,
50
- tags: TAG_WHITELIST,
51
- attributes: %w[src href],
52
- remove_empty_nodes: false,
53
- )
54
+ tags: TAG_WHITELIST,
55
+ attributes: %w[src href],
56
+ remove_empty_nodes: false,
57
+ )
54
58
  end
55
59
 
56
60
  def self.get_page_title(html)
@@ -81,7 +85,7 @@ module Textract
81
85
  agent = Mechanize.new
82
86
  agent.user_agent_alias = 'Mac Safari'
83
87
  @html = agent.get(url).content
84
- @tags = Textract.get_og_tags(@html)
88
+ @tags = Textract.get_og_tags(@html, url)
85
89
 
86
90
  @article = Textract.smart_extract(@html, @tags.description, selectors)
87
91
  if @article.content.nil?