RubyGems - textract - Versions diffs - 0.0.2 → 0.0.3 - Mend

textract 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6c0f4e826cede5d69f4c35d74129c33ffe6fd59a
-  data.tar.gz: 8884dac18c1ddf501ccabbfbde7d5f50c24e37d3
+  metadata.gz: 746f24065d2e6b1062f2b4e8c0386957098119a1
+  data.tar.gz: b14700f3c1ff7a1cdc5d20d34444cf096516c288
 SHA512:
-  metadata.gz: 43d1097a7252e581849883b43c4581757b029e9a8f2e8cca754c396a1ff519cd92e538d984fd163c7733b5a8fdeb1211465cc24a29bffd34918c06d9bce68ce7
-  data.tar.gz: 8658b5de346c5e47214bcc294627c9d9c2d0bf40446af26192836e8925429dd029c9a91cd75c54d2d3be62db32417d05a85a11c05b0786e80d8c868006b57bbb
+  metadata.gz: 92a531364daa3afffc42bad5de554e8c04334191d3584dceffad502273a124d4d447e1b3884a19d3143247386df06e7fde1143a6744f0573be838e52a681d13f
+  data.tar.gz: 9df332865ada80fa1fae38562d8c7934916787cc0410cf80909ecdbea8fe8cf72620f397c07aa3b6d82ade4352bce9abc18ed9dd9f51eb53e2d7aacbb7da2bf1

data/lib/textract.rb CHANGED Viewed

@@ -21,7 +21,6 @@ module Textract
     if selectors.nil?
       article = doc.search('article')
     else
-      require 'pry'; binding.pry
       article = doc.search(selectors)
     end
     if article.count == 1
@@ -47,14 +46,16 @@ module Textract
                                         tags: %w[div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong],
                                         attributes: %w[src href],
                                         remove_empty_nodes: true,
-                                       ).content
-    markdown = ReverseMarkdown.convert article, unknown_tags: :bypass
-    # TODO change to drop once article is supported by reversemarkdown
-    markdown
+                                       )
   end
   def self.get_page_title(html)
-    Nokogiri::HTML(html).search('title').text
+    Nokogiri::HTML(html).search('head').search('title').text
+  end
+  def self.get_author(html)
+    name_meta = Nokogiri::HTML(html).search('meta[name="author"]')
+    name_meta.attribute('content').value unless name_meta.empty?
   end
   class Client
@@ -63,6 +64,7 @@ module Textract
     attr_reader :tags
     attr_reader :title
     attr_reader :text
+    attr_reader :author
     def initialize(url, selectors)
       @url = url
@@ -70,14 +72,11 @@ module Textract
       agent.user_agent_alias = 'Mac Safari'
       @html = agent.get(url).content
       @tags = Textract.get_og_tags(@html)
-      if @tags.nil? or @tags.description.nil?
-        # use readability method
-        @text = Textract.get_text_from_description(@html, nil, selectors)
-        @title = Textract.get_page_title(@html)
-      else
-        @text = Textract.get_text_from_description(@html, @tags.description, selectors)
-        @title = @tags.title
-      end
+      @article = Textract.get_text_from_description(@html, @tags.description, selectors)
+      @text = ReverseMarkdown.convert @article.content, unknown_tags: :bypass
+      @author = @article.author || Textract.get_author(@html)
+      @title = @tags.title || Textract.get_page_title(@html)
     end
   end
 end

data/lib/textract/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Textract
-  VERSION = "0.0.2"
+  VERSION = "0.0.3"
 end

data/spec/lib/textract_spec.rb CHANGED Viewed

@@ -2,27 +2,28 @@ require_relative '../../lib/textract'
 describe Textract do
   it "initializes with the get_text method" do
     url = "http://www.tedcruz.org/about/"
-    textract = Textract.get_text(url)
-    expect(textract).to be_a_kind_of Textract::Client
+    article = Textract.get_text(url)
+    expect(article).to be_a_kind_of Textract::Client
   end
   it "returns article text based on article tag" do
     url = "http://gawker.com/1694508525"
-    textract = Textract.get_text(url)
-    expect(textract.text[0..5]).to eq "Import"
+    article = Textract.get_text(url)
+    expect(article.text[0..5]).to eq "Import"
+    expect(article.author).to eq "Hamilton Nolan"
   end
   it "returns article text based on opengraph description" do
     url = "http://www.tedcruz.org/record/our-standard-the-constitution/"
-    textract = Textract.get_text(url)
-    expect(textract.text[0..5]).to eq "Ted Cr"
+    article = Textract.get_text(url)
+    expect(article.text[0..5]).to eq "Ted Cr"
   end
   it "can find a twitter profile given a selector" do
     url = "https://twitter.com/lifehacker"
-    textract = Textract.get_text(url, 'p.ProfileHeaderCard-bio.u-dir')
-    expect(textract.text.strip).to eq "Don't live to geek; geek to live."
-    expect(textract.title).to eq "Lifehacker (@lifehacker) | Twitter"
+    article = Textract.get_text(url, 'p.ProfileHeaderCard-bio.u-dir')
+    expect(article.text.strip).to eq "Don't live to geek; geek to live."
+    expect(article.title).to eq "Lifehacker (@lifehacker) | Twitter"
   end
   it "gets the page title from the title tag" do
@@ -30,4 +31,9 @@ describe Textract do
     expect(Textract.get_page_title(html)).to eq "Stuff"
   end
+  it "gets the author from the meta name tag" do
+    html = '<html><head><meta name="author" content="Adam Pash"></head><body><h1>FOO!</h1></body></html>'
+    expect(Textract.get_author(html)).to eq "Adam Pash"
+  end
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: textract
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Adam Pash