textract 0.0.2 → 0.0.3

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6c0f4e826cede5d69f4c35d74129c33ffe6fd59a
4
- data.tar.gz: 8884dac18c1ddf501ccabbfbde7d5f50c24e37d3
3
+ metadata.gz: 746f24065d2e6b1062f2b4e8c0386957098119a1
4
+ data.tar.gz: b14700f3c1ff7a1cdc5d20d34444cf096516c288
5
5
  SHA512:
6
- metadata.gz: 43d1097a7252e581849883b43c4581757b029e9a8f2e8cca754c396a1ff519cd92e538d984fd163c7733b5a8fdeb1211465cc24a29bffd34918c06d9bce68ce7
7
- data.tar.gz: 8658b5de346c5e47214bcc294627c9d9c2d0bf40446af26192836e8925429dd029c9a91cd75c54d2d3be62db32417d05a85a11c05b0786e80d8c868006b57bbb
6
+ metadata.gz: 92a531364daa3afffc42bad5de554e8c04334191d3584dceffad502273a124d4d447e1b3884a19d3143247386df06e7fde1143a6744f0573be838e52a681d13f
7
+ data.tar.gz: 9df332865ada80fa1fae38562d8c7934916787cc0410cf80909ecdbea8fe8cf72620f397c07aa3b6d82ade4352bce9abc18ed9dd9f51eb53e2d7aacbb7da2bf1
data/lib/textract.rb CHANGED
@@ -21,7 +21,6 @@ module Textract
21
21
  if selectors.nil?
22
22
  article = doc.search('article')
23
23
  else
24
- require 'pry'; binding.pry
25
24
  article = doc.search(selectors)
26
25
  end
27
26
  if article.count == 1
@@ -47,14 +46,16 @@ module Textract
47
46
  tags: %w[div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong],
48
47
  attributes: %w[src href],
49
48
  remove_empty_nodes: true,
50
- ).content
51
- markdown = ReverseMarkdown.convert article, unknown_tags: :bypass
52
- # TODO change to drop once article is supported by reversemarkdown
53
- markdown
49
+ )
54
50
  end
55
51
 
56
52
  def self.get_page_title(html)
57
- Nokogiri::HTML(html).search('title').text
53
+ Nokogiri::HTML(html).search('head').search('title').text
54
+ end
55
+
56
+ def self.get_author(html)
57
+ name_meta = Nokogiri::HTML(html).search('meta[name="author"]')
58
+ name_meta.attribute('content').value unless name_meta.empty?
58
59
  end
59
60
 
60
61
  class Client
@@ -63,6 +64,7 @@ module Textract
63
64
  attr_reader :tags
64
65
  attr_reader :title
65
66
  attr_reader :text
67
+ attr_reader :author
66
68
 
67
69
  def initialize(url, selectors)
68
70
  @url = url
@@ -70,14 +72,11 @@ module Textract
70
72
  agent.user_agent_alias = 'Mac Safari'
71
73
  @html = agent.get(url).content
72
74
  @tags = Textract.get_og_tags(@html)
73
- if @tags.nil? or @tags.description.nil?
74
- # use readability method
75
- @text = Textract.get_text_from_description(@html, nil, selectors)
76
- @title = Textract.get_page_title(@html)
77
- else
78
- @text = Textract.get_text_from_description(@html, @tags.description, selectors)
79
- @title = @tags.title
80
- end
75
+
76
+ @article = Textract.get_text_from_description(@html, @tags.description, selectors)
77
+ @text = ReverseMarkdown.convert @article.content, unknown_tags: :bypass
78
+ @author = @article.author || Textract.get_author(@html)
79
+ @title = @tags.title || Textract.get_page_title(@html)
81
80
  end
82
81
  end
83
82
  end
@@ -1,3 +1,3 @@
1
1
  module Textract
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.3"
3
3
  end
@@ -2,27 +2,28 @@ require_relative '../../lib/textract'
2
2
  describe Textract do
3
3
  it "initializes with the get_text method" do
4
4
  url = "http://www.tedcruz.org/about/"
5
- textract = Textract.get_text(url)
6
- expect(textract).to be_a_kind_of Textract::Client
5
+ article = Textract.get_text(url)
6
+ expect(article).to be_a_kind_of Textract::Client
7
7
  end
8
8
 
9
9
  it "returns article text based on article tag" do
10
10
  url = "http://gawker.com/1694508525"
11
- textract = Textract.get_text(url)
12
- expect(textract.text[0..5]).to eq "Import"
11
+ article = Textract.get_text(url)
12
+ expect(article.text[0..5]).to eq "Import"
13
+ expect(article.author).to eq "Hamilton Nolan"
13
14
  end
14
15
 
15
16
  it "returns article text based on opengraph description" do
16
17
  url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
17
- textract = Textract.get_text(url)
18
- expect(textract.text[0..5]).to eq "Ted Cr"
18
+ article = Textract.get_text(url)
19
+ expect(article.text[0..5]).to eq "Ted Cr"
19
20
  end
20
21
 
21
22
  it "can find a twitter profile given a selector" do
22
23
  url = "https://twitter.com/lifehacker"
23
- textract = Textract.get_text(url, 'p.ProfileHeaderCard-bio.u-dir')
24
- expect(textract.text.strip).to eq "Don't live to geek; geek to live."
25
- expect(textract.title).to eq "Lifehacker (@lifehacker) | Twitter"
24
+ article = Textract.get_text(url, 'p.ProfileHeaderCard-bio.u-dir')
25
+ expect(article.text.strip).to eq "Don't live to geek; geek to live."
26
+ expect(article.title).to eq "Lifehacker (@lifehacker) | Twitter"
26
27
  end
27
28
 
28
29
  it "gets the page title from the title tag" do
@@ -30,4 +31,9 @@ describe Textract do
30
31
  expect(Textract.get_page_title(html)).to eq "Stuff"
31
32
  end
32
33
 
34
+ it "gets the author from the meta name tag" do
35
+ html = '<html><head><meta name="author" content="Adam Pash"></head><body><h1>FOO!</h1></body></html>'
36
+ expect(Textract.get_author(html)).to eq "Adam Pash"
37
+ end
38
+
33
39
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textract
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Adam Pash