textract 0.0.2 → 0.0.3
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/textract.rb +13 -14
- data/lib/textract/version.rb +1 -1
- data/spec/lib/textract_spec.rb +15 -9
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 746f24065d2e6b1062f2b4e8c0386957098119a1
|
4
|
+
data.tar.gz: b14700f3c1ff7a1cdc5d20d34444cf096516c288
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 92a531364daa3afffc42bad5de554e8c04334191d3584dceffad502273a124d4d447e1b3884a19d3143247386df06e7fde1143a6744f0573be838e52a681d13f
|
7
|
+
data.tar.gz: 9df332865ada80fa1fae38562d8c7934916787cc0410cf80909ecdbea8fe8cf72620f397c07aa3b6d82ade4352bce9abc18ed9dd9f51eb53e2d7aacbb7da2bf1
|
data/lib/textract.rb
CHANGED
@@ -21,7 +21,6 @@ module Textract
|
|
21
21
|
if selectors.nil?
|
22
22
|
article = doc.search('article')
|
23
23
|
else
|
24
|
-
require 'pry'; binding.pry
|
25
24
|
article = doc.search(selectors)
|
26
25
|
end
|
27
26
|
if article.count == 1
|
@@ -47,14 +46,16 @@ module Textract
|
|
47
46
|
tags: %w[div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong],
|
48
47
|
attributes: %w[src href],
|
49
48
|
remove_empty_nodes: true,
|
50
|
-
)
|
51
|
-
markdown = ReverseMarkdown.convert article, unknown_tags: :bypass
|
52
|
-
# TODO change to drop once article is supported by reversemarkdown
|
53
|
-
markdown
|
49
|
+
)
|
54
50
|
end
|
55
51
|
|
56
52
|
def self.get_page_title(html)
|
57
|
-
Nokogiri::HTML(html).search('title').text
|
53
|
+
Nokogiri::HTML(html).search('head').search('title').text
|
54
|
+
end
|
55
|
+
|
56
|
+
def self.get_author(html)
|
57
|
+
name_meta = Nokogiri::HTML(html).search('meta[name="author"]')
|
58
|
+
name_meta.attribute('content').value unless name_meta.empty?
|
58
59
|
end
|
59
60
|
|
60
61
|
class Client
|
@@ -63,6 +64,7 @@ module Textract
|
|
63
64
|
attr_reader :tags
|
64
65
|
attr_reader :title
|
65
66
|
attr_reader :text
|
67
|
+
attr_reader :author
|
66
68
|
|
67
69
|
def initialize(url, selectors)
|
68
70
|
@url = url
|
@@ -70,14 +72,11 @@ module Textract
|
|
70
72
|
agent.user_agent_alias = 'Mac Safari'
|
71
73
|
@html = agent.get(url).content
|
72
74
|
@tags = Textract.get_og_tags(@html)
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
@text = Textract.get_text_from_description(@html, @tags.description, selectors)
|
79
|
-
@title = @tags.title
|
80
|
-
end
|
75
|
+
|
76
|
+
@article = Textract.get_text_from_description(@html, @tags.description, selectors)
|
77
|
+
@text = ReverseMarkdown.convert @article.content, unknown_tags: :bypass
|
78
|
+
@author = @article.author || Textract.get_author(@html)
|
79
|
+
@title = @tags.title || Textract.get_page_title(@html)
|
81
80
|
end
|
82
81
|
end
|
83
82
|
end
|
data/lib/textract/version.rb
CHANGED
data/spec/lib/textract_spec.rb
CHANGED
@@ -2,27 +2,28 @@ require_relative '../../lib/textract'
|
|
2
2
|
describe Textract do
|
3
3
|
it "initializes with the get_text method" do
|
4
4
|
url = "http://www.tedcruz.org/about/"
|
5
|
-
|
6
|
-
expect(
|
5
|
+
article = Textract.get_text(url)
|
6
|
+
expect(article).to be_a_kind_of Textract::Client
|
7
7
|
end
|
8
8
|
|
9
9
|
it "returns article text based on article tag" do
|
10
10
|
url = "http://gawker.com/1694508525"
|
11
|
-
|
12
|
-
expect(
|
11
|
+
article = Textract.get_text(url)
|
12
|
+
expect(article.text[0..5]).to eq "Import"
|
13
|
+
expect(article.author).to eq "Hamilton Nolan"
|
13
14
|
end
|
14
15
|
|
15
16
|
it "returns article text based on opengraph description" do
|
16
17
|
url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
|
17
|
-
|
18
|
-
expect(
|
18
|
+
article = Textract.get_text(url)
|
19
|
+
expect(article.text[0..5]).to eq "Ted Cr"
|
19
20
|
end
|
20
21
|
|
21
22
|
it "can find a twitter profile given a selector" do
|
22
23
|
url = "https://twitter.com/lifehacker"
|
23
|
-
|
24
|
-
expect(
|
25
|
-
expect(
|
24
|
+
article = Textract.get_text(url, 'p.ProfileHeaderCard-bio.u-dir')
|
25
|
+
expect(article.text.strip).to eq "Don't live to geek; geek to live."
|
26
|
+
expect(article.title).to eq "Lifehacker (@lifehacker) | Twitter"
|
26
27
|
end
|
27
28
|
|
28
29
|
it "gets the page title from the title tag" do
|
@@ -30,4 +31,9 @@ describe Textract do
|
|
30
31
|
expect(Textract.get_page_title(html)).to eq "Stuff"
|
31
32
|
end
|
32
33
|
|
34
|
+
it "gets the author from the meta name tag" do
|
35
|
+
html = '<html><head><meta name="author" content="Adam Pash"></head><body><h1>FOO!</h1></body></html>'
|
36
|
+
expect(Textract.get_author(html)).to eq "Adam Pash"
|
37
|
+
end
|
38
|
+
|
33
39
|
end
|