textract 0.0.15 → 0.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 67ed83a055c856909a5b9e3d5735c640c4310741
4
- data.tar.gz: 4bd80a7d4249935ccc66d79543195e90d6f5d14e
3
+ metadata.gz: 6df62961d3d6f6fb6fb88d189bc657111ea04f65
4
+ data.tar.gz: 400fe86f14b624c7d280719d66b705ac5cea6b41
5
5
  SHA512:
6
- metadata.gz: f57ad83703e136ebdf5d101026a1422b1f84d22d04bedce80db3efff0dc068d67c05d00810ad0b55a8e5dd0c1ce8c60e340771300ae7f2c8f50f7a7389eaaa07
7
- data.tar.gz: 9b9e1a55b948a838e19c11b878b114c18368eab053684552ad9027cef45fc91a67abccd89ed20d2ffd11244c87489004304e1236cb1cd1976afbde32ab63af7f
6
+ metadata.gz: e3797cea30cc5ddc90e672c75b4c700a900c6bbe23bb916b504b2ea2e5ecaef1bc5619d90293c9574b85dea2fc8769d8585b13ae2615ac2bbb9806ffdfab13c5
7
+ data.tar.gz: b3aa7a586c9ab8a679f2d89e0fef6e9847612ab8c945420388e6dec8769ed0dd993158321cb470f9e99e6750f0bab17201795765da1d17c57e72fe5e9c1acdb3
data/lib/textract.rb CHANGED
@@ -78,9 +78,14 @@ module Textract
78
78
  twitter_meta.attribute('content').value unless twitter_meta.empty?
79
79
  end
80
80
 
81
- def self.build_site(html)
81
+ def self.build_site(url, html)
82
82
  site_twitter = Nokogiri::HTML(html).search('meta[name="twitter:site"]')
83
83
  site_name = Nokogiri::HTML(html).search('meta[property="og:site_name"]')
84
+ if site_name.empty?
85
+ site = url.match(/(http|ftp)s?:\/\/((\w+\.)?(\w+\.)(\w+))\//)
86
+ site = site[2] unless site[2].nil?
87
+ site = site.sub(/^www\./, '').capitalize!
88
+ end
84
89
  {
85
90
  name: site_name.empty? ? nil : site_name.attribute('content').value,
86
91
  twitter: site_twitter.empty? ? nil : site_twitter.attribute('content').value,
@@ -131,7 +136,7 @@ module Textract
131
136
  end
132
137
  @md5 = Textract.generate_hash @text
133
138
  @author = Textract.build_author @article, @html
134
- @site = Textract.build_site @html
139
+ @site = Textract.build_site @url, @html
135
140
  @title = @tags.title || Textract.get_page_title(@html)
136
141
  if @url.match(/\/robots.txt$/) and @title = @text
137
142
  @title = @url
@@ -1,3 +1,3 @@
1
1
  module Textract
2
- VERSION = "0.0.15"
2
+ VERSION = "0.0.16"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.15
4
+ version: 0.0.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Pash