textract 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f61cb21afc706941ebc3f09756dd153f7e6bd4ae
4
- data.tar.gz: 78905e74279b209a7ccce1095b543050f749bb6f
3
+ metadata.gz: b078301e33791b376b44f58325b18b84eaf9740b
4
+ data.tar.gz: fe260d19a7631f77ba97dc666ee95cc8f94f7304
5
5
  SHA512:
6
- metadata.gz: 9197c83b96dda8e79e3c88c08cb2aec64e3fa6b019722df9c92d7e84151d54390df9db18024f7cd5b2c00efeb4b314dafd8e48aa3bda89e31f97211ae122655d
7
- data.tar.gz: c454baf54a7be0032972c4bb20e2ca87526942cf40f35a6bb83025588f299ef03c3dff0e25af278068c14211a10125f083267244bd7f9e28f9d65d3c3150c06a
6
+ metadata.gz: 481b7ccc2899cb1d78c1c333943fb0bfd1497ae30052c5e08478b3784a58d83b44134ca8fc9940c292fecd415221727bb39e9d7777da5b2414273fd0b72fdb35
7
+ data.tar.gz: f1992af9b124c953579e70f58b4dfe2dad797685cc025fba513b9e66c189fdfbfec45efa1f56f147dd4a2cd89b5b14c817e2789853a9e8f4a3a362f8336f6034
data/lib/textract/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Textract
2
- VERSION = "0.0.8"
2
+ VERSION = "0.0.9"
3
3
  end
data/lib/textract.rb CHANGED
@@ -16,8 +16,12 @@ module Textract
16
16
  @client = Client.new(url, selectors, format)
17
17
  end
18
18
 
19
- def self.get_og_tags(html)
20
- OpenGraph.new(html)
19
+ def self.get_og_tags(html, url)
20
+ begin
21
+ OpenGraph.new(html)
22
+ rescue
23
+ OpenGraph.new(url)
24
+ end
21
25
  end
22
26
 
23
27
  def self.smart_extract(html, description, selectors)
@@ -47,10 +51,10 @@ module Textract
47
51
  article_el = doc
48
52
  end
49
53
  Readability::Document.new(article_el.to_s,
50
- tags: TAG_WHITELIST,
51
- attributes: %w[src href],
52
- remove_empty_nodes: false,
53
- )
54
+ tags: TAG_WHITELIST,
55
+ attributes: %w[src href],
56
+ remove_empty_nodes: false,
57
+ )
54
58
  end
55
59
 
56
60
  def self.get_page_title(html)
@@ -81,7 +85,7 @@ module Textract
81
85
  agent = Mechanize.new
82
86
  agent.user_agent_alias = 'Mac Safari'
83
87
  @html = agent.get(url).content
84
- @tags = Textract.get_og_tags(@html)
88
+ @tags = Textract.get_og_tags(@html, url)
85
89
 
86
90
  @article = Textract.smart_extract(@html, @tags.description, selectors)
87
91
  if @article.content.nil?