textract 0.0.13 → 0.0.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/textract/version.rb +1 -1
- data/lib/textract.rb +18 -1
- data/spec/fixtures/vcr_cassettes/twitter_byline.yml +2277 -0
- data/spec/lib/textract_spec.rb +15 -1
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0336cf0be51c8b9b98137b67d6d58cab27cbde14
|
4
|
+
data.tar.gz: 572f3955c091883512b94071264c89d3dbfba137
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bdd093f12bd58d70275e3cbbdb5fb61a6b7a336afcc7e92697d7ec2d2f3301b36ea9d5be222748b97f4251e4887f84110282737d09b1e8ec4b86ce23718879bd
|
7
|
+
data.tar.gz: 29729f554d34cca2a374c8ad5941d09a0f1ee62c9cfc25902af15cf17c1bec5c45e9e30edb088d6ef135ee33315a8c9e9073adc18acde2bf5aefcf1753d81d71
|
data/lib/textract/version.rb
CHANGED
data/lib/textract.rb
CHANGED
@@ -67,9 +67,26 @@ module Textract
|
|
67
67
|
|
68
68
|
def self.get_author(html)
|
69
69
|
name_meta = Nokogiri::HTML(html).search('meta[name="author"]')
|
70
|
+
if name_meta.empty?
|
71
|
+
name_meta = Nokogiri::HTML(html).search('meta[property="author"]')
|
72
|
+
end
|
70
73
|
name_meta.attribute('content').value unless name_meta.empty?
|
71
74
|
end
|
72
75
|
|
76
|
+
def self.get_twitter(html)
|
77
|
+
twitter_meta = Nokogiri::HTML(html).search('meta[name="twitter:creator"]')
|
78
|
+
twitter_meta.attribute('content').value unless twitter_meta.empty?
|
79
|
+
end
|
80
|
+
|
81
|
+
def self.build_author(article, html)
|
82
|
+
{
|
83
|
+
name: article.author || get_author(html),
|
84
|
+
twitter: get_twitter(html),
|
85
|
+
}
|
86
|
+
end
|
87
|
+
|
88
|
+
# def build_site
|
89
|
+
|
73
90
|
def self.generate_hash(text)
|
74
91
|
Digest::MD5.hexdigest text
|
75
92
|
end
|
@@ -104,7 +121,7 @@ module Textract
|
|
104
121
|
end
|
105
122
|
end
|
106
123
|
@md5 = Textract.generate_hash @text
|
107
|
-
@author = @article
|
124
|
+
@author = Textract.build_author @article, @html
|
108
125
|
@title = @tags.title || Textract.get_page_title(@html)
|
109
126
|
if @url.match(/\/robots.txt$/) and @title = @text
|
110
127
|
@title = @url
|