RubyGems - alchemy_api - Versions diffs - 0.1.0 - Mend

alchemy_api 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

data/lib/alchemy_api/text_extraction.rb ADDED Viewed

@@ -0,0 +1,85 @@
+module AlchemyApi
+  ExtractedText = Struct.new(:url, :text)
+  ExtractedTitle = Struct.new(:url, :title)
+  class TextExtraction < Base
+    # Usage:
+    # AlchemyApi::TextExtraction.get_text_from_url(
+    #     "http://google.com",
+    #     :use_metadata => 1,
+    #     :extract_links => 1)
+    post(:get_text_from_url) do |url, *args|
+      options = args.first || {}
+      uri "#{AlchemyApi.base_uri}/URLGetText"
+      params :url => url,
+             :useMetadata => options[:use_metadata] || 1,
+             :extractLinks => options[:extract_links] || 0
+      handler do |response|
+        AlchemyApi::TextExtraction.get_text_from_url_handler(response)
+      end
+    end
+    post(:get_raw_text_from_url) do |url|
+      uri "#{AlchemyApi.base_uri}/URLGetRawText"
+      params :url => url
+      handler do |response|
+        AlchemyApi::TextExtraction.get_text_from_url_handler(response)
+      end
+    end
+    post(:get_title_from_url) do |url, *args|
+      options = args.first || {}
+      uri "#{AlchemyApi.base_uri}/URLGetTitle"
+      params :url => url,
+             :useMetadata => options[:use_metadata] || 1
+      handler do |response|
+        AlchemyApi::TextExtraction.get_title_from_url_handler(response)
+      end
+    end
+    post(:get_text_from_html) do |html, *args|
+      options = args.first || {}
+      uri "#{AlchemyApi.base_html_uri}/HTMLGetText"
+      params :html => html,
+             :url => options[:url] || '',
+             :useMetadata => options[:use_metadata] || 1,
+             :extractLinks => options[:extract_links] || 0
+      handler do |response|
+        AlchemyApi::TextExtraction.get_text_from_url_handler(response)
+      end
+    end
+    post(:get_raw_text_from_html) do |html, *args|
+      options = args.first || {}
+      uri "#{AlchemyApi.base_html_uri}/HTMLGetRawText"
+      params :html => html,
+             :url => options[:url] || ''
+      handler do |response|
+        AlchemyApi::TextExtraction.get_text_from_url_handler(response)
+      end
+    end
+    post(:get_title_from_html) do |html, *args|
+      options = args.first || {}
+      uri "#{AlchemyApi.base_html_uri}/HTMLGetTitle"
+      params :html => html,
+             :url => options[:url] || '',
+             :useMetadata => options[:use_metadata] || 1
+      handler do |response|
+        AlchemyApi::TextExtraction.get_title_from_url_handler(response)
+      end
+    end
+    def self.get_title_from_url_handler(response)
+      json = get_json(response)
+      ExtractedTitle.new(json['url'], json['title'])
+    end
+    def self.get_text_from_url_handler(response)
+      json = get_json(response)
+      check_json_for_errors_and_raise!(json)
+      ExtractedText.new(json['url'], json['text'])
+    end
+  end
+end

data/spec/alchemy_api/base_spec.rb ADDED Viewed

@@ -0,0 +1,62 @@
+require File.dirname(__FILE__) + '/../spec_helper'
+describe AlchemyApi::Base do
+  describe "#check_json_for_errors_and_raise!" do
+    before(:each) do
+      @json = {
+        'status' => 'ERROR',
+        'url' => 'http://google.com',
+        'statusInfo' => nil # replace in each test.
+      }
+    end
+    it "should raise an error if the API key is invalid" do
+      @json['statusInfo'] = 'invalid-api-key'
+      lambda {
+        AlchemyApi::Base.check_json_for_errors_and_raise!(@json)
+      }.should raise_error(AlchemyApi::InvalidApiKeyError)
+    end
+    it "should raise an error if the page is not retrievable" do
+      @json['statusInfo'] = 'cannot-retrieve'
+      lambda {
+        AlchemyApi::Base.check_json_for_errors_and_raise!(@json)
+      }.should raise_error(AlchemyApi::CannotRetrieveUrlError)
+    end
+    it "should raise an error if the page is not valid HTML" do
+      @json['statusInfo'] = 'page-is-not-html'
+      lambda {
+        AlchemyApi::Base.check_json_for_errors_and_raise!(@json)
+      }.should raise_error(AlchemyApi::PageIsNotValidHtmlError)
+    end
+    it "should raise an error if the sent HTML was not valid" do
+      @json['statusInfo'] = 'invalid-html'
+      lambda {
+        AlchemyApi::Base.check_json_for_errors_and_raise!(@json)
+      }.should raise_error(AlchemyApi::InvalidHtmlError)
+    end
+    it "should raise an error if the content exceeds the max limit" do
+      @json['statusInfo'] = 'content-exceeds-size-limit'
+      lambda {
+        AlchemyApi::Base.check_json_for_errors_and_raise!(@json)
+      }.should raise_error(AlchemyApi::ContentExceedsMaxLimitError)
+    end
+    it "should raise an error if the content cannot be retrieve due to redirection limit" do
+      @json['statusInfo'] = 'cannot-retrieve:http-redirect-limit'
+      lambda {
+        AlchemyApi::Base.check_json_for_errors_and_raise!(@json)
+      }.should raise_error(AlchemyApi::RedirectionLimitError)
+    end
+    it "should raise an UnknownError if we get something we don't recognize" do
+      @json['statusInfo'] = 'fdsafdsfdsafdskjldklfdad'
+      lambda {
+        AlchemyApi::Base.check_json_for_errors_and_raise!(@json)
+      }.should raise_error(AlchemyApi::UnknownError)
+    end
+  end
+end

data/spec/alchemy_api/categorization_spec.rb ADDED Viewed

@@ -0,0 +1,54 @@
+require File.dirname(__FILE__) + "/../spec_helper"
+describe AlchemyApi::Categorization do
+  typhoeus_spec_cache('spec/cache/categorization/get_categorization_from_text') do |hydra|
+    describe "#get_categorization_from_text" do
+      before(:each) do
+        @url = "http://test.com"
+        text = fixture_for('article.txt')
+        @category = AlchemyApi::Categorization.
+          get_categorization_from_text(text)
+      end
+      it "should return a category name" do
+        @category.name.should_not be_nil
+      end
+    end
+  end
+  typhoeus_spec_cache('spec/cache/categorization/get_categorization_from_url') do |hydra|
+    describe "#get_categorization_from_url" do
+      before(:each) do
+        @url = 'http://www.macrumors.com/2010/04/30/apples-discontinuation-of-lala-streaming-music-service-not-likely-leading-to-imminent-launch-of-web-focused-itunes/'
+        @category = AlchemyApi::Categorization.
+          get_categorization_from_url(@url,
+                                      :source_text => 'cleaned_or_raw')
+      end
+      it "should return a category name" do
+        @category.name.should_not be_nil
+      end
+      it "should return a url" do
+        @category.url.should_not be_nil
+      end
+    end
+  end
+  typhoeus_spec_cache('spec/cache/categorization/get_categorization_from_html') do |hydra|
+    describe "#get_categorization_from_html" do
+      before(:each) do
+        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
+        @html = fixture_for('bp_spill.html')
+        @category = AlchemyApi::Categorization.
+          get_categorization_from_html(@html, :url => @url,
+                                       :source_text => 'cleaned_or_raw')
+      end
+      it "should return a category" do
+        @category.name.should_not be_nil
+      end
+    end
+  end
+end

data/spec/alchemy_api/language_detection_spec.rb ADDED Viewed

@@ -0,0 +1,72 @@
+require File.dirname(__FILE__) + "/../spec_helper"
+describe AlchemyApi::LanguageDetection do
+  shared_examples_for 'a language detector' do
+    it "should get a language" do
+      @result.language.should_not be_nil
+    end
+    it "should get ISO codes" do
+      @result.iso_639_1.should_not be_nil
+      @result.iso_639_2.should_not be_nil
+      @result.iso_639_3.should_not be_nil
+    end
+    it "should get the ethnologue URL" do
+      @result.ethnologue_url.should_not be_nil
+    end
+    it "should get a native speaker count" do
+      @result.native_speakers.should_not be_nil
+    end
+    it "should get a wikipedia URL" do
+      @result.wikipedia_url.should_not be_nil
+    end
+  end
+  typhoeus_spec_cache('spec/cache/language_detection/get_language_from_url') do |hydra|
+    describe "#get_language_from_url" do
+      before(:each) do
+        @url = 'http://www.humboldtbrews.com/2010_index_music.htm'
+        @result = AlchemyApi::LanguageDetection.
+          get_language_from_url(@url,
+                                :source_text => 'cleaned_or_raw')
+      end
+      it_should_behave_like 'a language detector'
+      it "should get the URL" do
+        @result.url.should == @url
+      end
+    end
+  end
+  typhoeus_spec_cache('spec/cache/language_detection/get_language_from_text') do |hydra|
+    describe "#get_language_from_text" do
+      before(:each) do
+        @url = "http://test.com"
+        text = fixture_for('article.txt')
+        @result = AlchemyApi::LanguageDetection.
+          get_language_from_text(text,
+                                 :url => @url)
+      end
+      it_should_behave_like 'a language detector'
+    end
+  end
+  typhoeus_spec_cache('spec/cache/language_detection/get_language_from_html') do |hydra|
+    describe "#get_language_from_html" do
+      before(:each) do
+        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
+        @html = fixture_for('bp_spill.html')
+        @result = AlchemyApi::LanguageDetection.
+          get_language_from_html(@html, :url => @url,
+                                 :source_text => 'cleaned_or_raw')
+      end
+      it_should_behave_like 'a language detector'
+    end
+  end
+end

data/spec/alchemy_api/term_extraction_spec.rb ADDED Viewed

@@ -0,0 +1,70 @@
+require File.dirname(__FILE__) + "/../spec_helper"
+describe AlchemyApi::TermExtraction do
+  shared_examples_for 'a keyword ranker' do
+    it "should return source text" do
+      @result.source_text.should_not be_nil
+    end
+    it "should return 5 keywords" do
+      @result.keywords.should have(5).things
+    end
+    it "should have relevance scores for the keywords" do
+      @result.keywords.each do |kw|
+        kw.relevance.should >= 0.0
+        kw.relevance.should <= 1.0
+      end
+    end
+  end
+  typhoeus_spec_cache('spec/cache/term_extraction/get_ranked_keywords_from_html') do |hydra|
+    describe "#get_ranked_keywords_from_html" do
+      before(:each) do
+        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
+        @html = fixture_for('bp_spill.html')
+        @result = AlchemyApi::TermExtraction.
+          get_ranked_keywords_from_html(@html,
+                                        :url => @url,
+                                        :max_retrieve => 5,
+                                        :show_source_text => true)
+      end
+      it_should_behave_like 'a keyword ranker'
+    end
+  end
+  typhoeus_spec_cache('spec/cache/term_extraction/get_ranked_keywords_from_text') do |hydra|
+    describe "#get_ranked_keywords_from_text" do
+      before(:each) do
+        @url = "http://test.com"
+        text = fixture_for('article.txt')
+        @result = AlchemyApi::TermExtraction.
+          get_ranked_keywords_from_text(text,
+                                        :url => @url,
+                                        :max_retrieve => 5,
+                                        :show_source_text => true)
+      end
+      it_should_behave_like 'a keyword ranker'
+    end
+  end
+  typhoeus_spec_cache('spec/cache/term_extraction/get_ranked_keywords_from_url') do |hydra|
+    describe "#get_ranked_keywords_from_url" do
+      before(:each) do
+        @url = 'http://www.businessweek.com/news/2010-05-02/bp-spill-threatens-gulf-of-mexico-oil-gas-operations-update1-.html'
+        @result = AlchemyApi::TermExtraction.
+          get_ranked_keywords_from_url(@url,
+                                       :max_retrieve => 5,
+                                       :show_source_text => true)
+      end
+      it "should return the given URL" do
+        @result.url.should == @url
+      end
+      it_should_behave_like 'a keyword ranker'
+    end
+  end
+end

data/spec/alchemy_api/text_extraction_spec.rb ADDED Viewed

@@ -0,0 +1,94 @@
+require File.dirname(__FILE__) + "/../spec_helper"
+describe AlchemyApi::TextExtraction do
+  typhoeus_spec_cache('spec/cache/text_extraction/get_text_from_url') do |hydra|
+    describe "#get_text_from_url" do
+      it "should extract text" do
+        url = "http://www.chron.com/disp/story.mpl/business/6981685.html"
+        result = AlchemyApi::TextExtraction.get_text_from_url(url)
+        result.url.should == url
+        result.text.should_not be_empty
+      end
+    end
+  end
+  typhoeus_spec_cache('spec/cache/text_extraction/get_raw_text_from_url') do |hydra|
+    describe "#get_raw_text_from_url" do
+      before(:each) do
+        @url = "http://www.chron.com/disp/story.mpl/business/6981685.html"
+        @result = AlchemyApi::TextExtraction.get_raw_text_from_url(@url)
+      end
+      it "should extract url" do
+        @result.url.should == @url
+      end
+      it "should get text back" do
+        @result.text.should_not be_empty
+      end
+    end
+  end
+  typhoeus_spec_cache('spec/cache/text_extraction/get_title_from_url') do |hydra|
+    describe "#get_title_from_url" do
+      before(:each) do
+        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
+        @result = AlchemyApi::TextExtraction.get_title_from_url(@url)
+      end
+      it "should extract url" do
+        @result.url.should == @url
+      end
+      it "should get text back" do
+        @result.title.should =~ /BP Spill/
+      end
+    end
+  end
+  typhoeus_spec_cache('spec/cache/text_extraction/get_title_from_html') do |hydra|
+    describe "#get_title_from_html" do
+      before(:each) do
+        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
+        @html = fixture_for('bp_spill.html')
+        @result = AlchemyApi::TextExtraction.
+          get_title_from_html(@html, :url => @url)
+      end
+      it "should get title back" do
+        @result.title.should =~ /BP Spill/
+      end
+    end
+  end
+  typhoeus_spec_cache('spec/cache/text_extraction/get_raw_text_from_html') do |hydra|
+    describe "#get_raw_text_from_html" do
+      before(:each) do
+        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
+        @html = fixture_for('bp_spill.html')
+        @result = AlchemyApi::TextExtraction.
+          get_raw_text_from_html(@html, :url => @url)
+      end
+      it "should get text back" do
+        @result.text.should_not be_empty
+      end
+    end
+  end
+  typhoeus_spec_cache('spec/cache/text_extraction/get_text_from_html') do |hydra|
+    describe "#get_text_from_html" do
+      before(:each) do
+        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
+        @html = fixture_for('bp_spill.html')
+        @result = AlchemyApi::TextExtraction.
+          get_text_from_html(@html, :url => @url)
+      end
+      it "should get text back" do
+        @result.text.should_not be_empty
+      end
+    end
+  end
+end

data/spec/alchemy_api_spec.rb ADDED Viewed

@@ -0,0 +1,10 @@
+require File.dirname(__FILE__) + '/spec_helper'
+describe AlchemyApi do
+  describe "#api_key" do
+    it "should be settable" do
+      AlchemyApi.api_key = "fdsa"
+      AlchemyApi.api_key.should == 'fdsa'
+    end
+  end
+end