alchemy_api 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40)
  1. data/.document +5 -0
  2. data/.gitignore +24 -0
  3. data/LICENSE +20 -0
  4. data/README.markdown +36 -0
  5. data/Rakefile +47 -0
  6. data/VERSION +1 -0
  7. data/alchemy_api.gemspec +92 -0
  8. data/lib/alchemy_api.rb +29 -0
  9. data/lib/alchemy_api/base.rb +37 -0
  10. data/lib/alchemy_api/categorization.rb +47 -0
  11. data/lib/alchemy_api/language_detection.rb +51 -0
  12. data/lib/alchemy_api/term_extraction.rb +56 -0
  13. data/lib/alchemy_api/text_extraction.rb +85 -0
  14. data/spec/alchemy_api/base_spec.rb +62 -0
  15. data/spec/alchemy_api/categorization_spec.rb +54 -0
  16. data/spec/alchemy_api/language_detection_spec.rb +72 -0
  17. data/spec/alchemy_api/term_extraction_spec.rb +70 -0
  18. data/spec/alchemy_api/text_extraction_spec.rb +94 -0
  19. data/spec/alchemy_api_spec.rb +10 -0
  20. data/spec/cache/categorization/get_categorization_from_html/ddc3cf50efe5bd5c2159abfb49121cfa2314ca88.cache +29 -0
  21. data/spec/cache/categorization/get_categorization_from_text/8b476a3b532afd2da646b145e9dde07570c27352.cache +29 -0
  22. data/spec/cache/categorization/get_categorization_from_url/7536a34e1d54a95d8ee07d2a98036362761e1621.cache +27 -0
  23. data/spec/cache/language_detection/get_language_from_html/0faf7be978647b611d9c59e1efa497dd76e542f5.cache +33 -0
  24. data/spec/cache/language_detection/get_language_from_text/1ad3f50c1fda37000e24c196f12212ea9d536cb4.cache +33 -0
  25. data/spec/cache/language_detection/get_language_from_url/d077a95e60be0876bb7650ad213f5f43e83454d4.cache +31 -0
  26. data/spec/cache/term_extraction/get_ranked_keywords_from_html/7718a0fbd03739e4213a4e66c32a79a10c3499c3.cache +50 -0
  27. data/spec/cache/term_extraction/get_ranked_keywords_from_text/6f49e68ee4a9150368e671e70b632dbdc40860bb.cache +51 -0
  28. data/spec/cache/term_extraction/get_ranked_keywords_from_url/b9c291523159563d2224d676ec43b7b79a902d21.cache +48 -0
  29. data/spec/cache/text_extraction/get_raw_text_from_html/9db19f848a798db1f9a8c6cce9074d03cf2637a8.cache +27 -0
  30. data/spec/cache/text_extraction/get_raw_text_from_html/e9c236b6e861b57d238c810bb3c307cada170cad.cache +17 -0
  31. data/spec/cache/text_extraction/get_raw_text_from_url/8f5dff27211163e41ea5e7c3c534acf7b87d2098.cache +25 -0
  32. data/spec/cache/text_extraction/get_text_from_html/e7e6dba4c8570a41dbcb05233793018fc5ae4e1e.cache +27 -0
  33. data/spec/cache/text_extraction/get_text_from_url/13facbfeae029d936c7dc18ecaff5d2764b94618.cache +25 -0
  34. data/spec/cache/text_extraction/get_title_from_html/2a526348db23f992fee293d34f94c087e77290c5.cache +27 -0
  35. data/spec/cache/text_extraction/get_title_from_url/e84c0c7c67668706ae0cf3eefcd88c0911cd2b65.cache +25 -0
  36. data/spec/fixtures/article.txt +9 -0
  37. data/spec/fixtures/bp_spill.html +929 -0
  38. data/spec/spec.opts +1 -0
  39. data/spec/spec_helper.rb +20 -0
  40. metadata +136 -0
@@ -0,0 +1,85 @@
module AlchemyApi
  # Value object returned by the text extraction calls: the URL reported by
  # the API and the extracted text.
  ExtractedText = Struct.new(:url, :text)
  # Value object returned by the title extraction calls: the URL reported by
  # the API and the extracted title.
  ExtractedTitle = Struct.new(:url, :title)

  # Text and title extraction endpoints. Each endpoint is declared through the
  # `post` DSL (defined outside this file, presumably inherited from Base),
  # which receives the request URI, the request params and a response handler.
  class TextExtraction < Base
    # Usage:
    #   AlchemyApi::TextExtraction.get_text_from_url(
    #     "http://google.com",
    #     :use_metadata => 1,
    #     :extract_links => 1)
    #
    # Options default to :use_metadata => 1 and :extract_links => 0.
    post(:get_text_from_url) do |url, *args|
      options = args.first || {}
      uri "#{AlchemyApi.base_uri}/URLGetText"
      params :url => url,
             :useMetadata => options[:use_metadata] || 1,
             :extractLinks => options[:extract_links] || 0

      handler do |response|
        AlchemyApi::TextExtraction.get_text_from_url_handler(response)
      end
    end

    # Raw text for a URL; takes no options.
    post(:get_raw_text_from_url) do |url|
      uri "#{AlchemyApi.base_uri}/URLGetRawText"
      params :url => url
      handler do |response|
        AlchemyApi::TextExtraction.get_text_from_url_handler(response)
      end
    end

    # Title for a URL. Option :use_metadata defaults to 1.
    post(:get_title_from_url) do |url, *args|
      options = args.first || {}
      uri "#{AlchemyApi.base_uri}/URLGetTitle"
      params :url => url,
             :useMetadata => options[:use_metadata] || 1
      handler do |response|
        AlchemyApi::TextExtraction.get_title_from_url_handler(response)
      end
    end

    # Text for a posted HTML document. Options: :url (defaults to ''),
    # :use_metadata (defaults to 1), :extract_links (defaults to 0).
    # The response is parsed by the same handler as the URL variant.
    post(:get_text_from_html) do |html, *args|
      options = args.first || {}
      uri "#{AlchemyApi.base_html_uri}/HTMLGetText"
      params :html => html,
             :url => options[:url] || '',
             :useMetadata => options[:use_metadata] || 1,
             :extractLinks => options[:extract_links] || 0
      handler do |response|
        AlchemyApi::TextExtraction.get_text_from_url_handler(response)
      end
    end

    # Raw text for a posted HTML document. Option :url defaults to ''.
    post(:get_raw_text_from_html) do |html, *args|
      options = args.first || {}
      uri "#{AlchemyApi.base_html_uri}/HTMLGetRawText"
      params :html => html,
             :url => options[:url] || ''
      handler do |response|
        AlchemyApi::TextExtraction.get_text_from_url_handler(response)
      end
    end

    # Title for a posted HTML document. Options: :url (defaults to ''),
    # :use_metadata (defaults to 1).
    post(:get_title_from_html) do |html, *args|
      options = args.first || {}
      uri "#{AlchemyApi.base_html_uri}/HTMLGetTitle"
      params :html => html,
             :url => options[:url] || '',
             :useMetadata => options[:use_metadata] || 1
      handler do |response|
        AlchemyApi::TextExtraction.get_title_from_url_handler(response)
      end
    end

    # Builds an ExtractedTitle from a title endpoint response.
    #
    # The parsed payload is checked for API errors first, exactly as the text
    # handler does; without the check an error payload was turned into an
    # ExtractedTitle (presumably with a nil title) instead of raising.
    def self.get_title_from_url_handler(response)
      json = get_json(response)
      check_json_for_errors_and_raise!(json)
      ExtractedTitle.new(json['url'], json['title'])
    end

    # Builds an ExtractedText from a text endpoint response, raising through
    # check_json_for_errors_and_raise! when the payload reports an error.
    def self.get_text_from_url_handler(response)
      json = get_json(response)
      check_json_for_errors_and_raise!(json)
      ExtractedText.new(json['url'], json['text'])
    end
  end
end
@@ -0,0 +1,62 @@
require File.dirname(__FILE__) + '/../spec_helper'

describe AlchemyApi::Base do
  describe "#check_json_for_errors_and_raise!" do
    # One row per error case: the statusInfo value placed in the payload, the
    # error class expected to be raised, and the example description.
    # An array of rows (rather than a hash) keeps the examples in this order.
    [
      ['invalid-api-key',
       AlchemyApi::InvalidApiKeyError,
       "should raise an error if the API key is invalid"],
      ['cannot-retrieve',
       AlchemyApi::CannotRetrieveUrlError,
       "should raise an error if the page is not retrievable"],
      ['page-is-not-html',
       AlchemyApi::PageIsNotValidHtmlError,
       "should raise an error if the page is not valid HTML"],
      ['invalid-html',
       AlchemyApi::InvalidHtmlError,
       "should raise an error if the sent HTML was not valid"],
      ['content-exceeds-size-limit',
       AlchemyApi::ContentExceedsMaxLimitError,
       "should raise an error if the content exceeds the max limit"],
      ['cannot-retrieve:http-redirect-limit',
       AlchemyApi::RedirectionLimitError,
       "should raise an error if the content cannot be retrieved due to redirection limit"],
      ['fdsafdsfdsafdskjldklfdad',
       AlchemyApi::UnknownError,
       "should raise an UnknownError if we get something we don't recognize"]
    ].each do |status_info, error_class, description|
      it description do
        json = {
          'status' => 'ERROR',
          'url' => 'http://google.com',
          'statusInfo' => status_info
        }

        lambda {
          AlchemyApi::Base.check_json_for_errors_and_raise!(json)
        }.should raise_error(error_class)
      end
    end
  end
end
@@ -0,0 +1,54 @@
require File.dirname(__FILE__) + "/../spec_helper"

# Specs for the categorization endpoints. Each group is wrapped in
# typhoeus_spec_cache with its own directory under spec/cache; the helper is
# defined outside this file (presumably it replays stored HTTP responses).
# The yielded hydra is not needed here, hence the underscore-prefixed name.
describe AlchemyApi::Categorization do
  typhoeus_spec_cache('spec/cache/categorization/get_categorization_from_text') do |_hydra|
    describe "#get_categorization_from_text" do
      before(:each) do
        # No URL is sent with this call, so none is set up here.
        text = fixture_for('article.txt')

        @category = AlchemyApi::Categorization.
          get_categorization_from_text(text)
      end

      it "should return a category name" do
        @category.name.should_not be_nil
      end
    end
  end

  typhoeus_spec_cache('spec/cache/categorization/get_categorization_from_url') do |_hydra|
    describe "#get_categorization_from_url" do
      before(:each) do
        url = 'http://www.macrumors.com/2010/04/30/apples-discontinuation-of-lala-streaming-music-service-not-likely-leading-to-imminent-launch-of-web-focused-itunes/'
        @category = AlchemyApi::Categorization.
          get_categorization_from_url(url,
                                      :source_text => 'cleaned_or_raw')
      end

      it "should return a category name" do
        @category.name.should_not be_nil
      end

      it "should return a url" do
        @category.url.should_not be_nil
      end
    end
  end

  typhoeus_spec_cache('spec/cache/categorization/get_categorization_from_html') do |_hydra|
    describe "#get_categorization_from_html" do
      before(:each) do
        url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
        html = fixture_for('bp_spill.html')
        @category = AlchemyApi::Categorization.
          get_categorization_from_html(html, :url => url,
                                       :source_text => 'cleaned_or_raw')
      end

      it "should return a category" do
        @category.name.should_not be_nil
      end
    end
  end
end
@@ -0,0 +1,72 @@
require File.dirname(__FILE__) + "/../spec_helper"

# Specs for the language detection endpoints. Every endpoint must satisfy the
# shared 'a language detector' examples, which read the @result set up by the
# including group.
describe AlchemyApi::LanguageDetection do
  shared_examples_for 'a language detector' do
    it "should get a language" do
      @result.language.should_not be_nil
    end

    it "should get ISO codes" do
      # Checked in the same order: 639-1, 639-2, 639-3.
      [:iso_639_1, :iso_639_2, :iso_639_3].each do |iso_code|
        @result.send(iso_code).should_not be_nil
      end
    end

    it "should get the ethnologue URL" do
      @result.ethnologue_url.should_not be_nil
    end

    it "should get a native speaker count" do
      @result.native_speakers.should_not be_nil
    end

    it "should get a wikipedia URL" do
      @result.wikipedia_url.should_not be_nil
    end
  end

  typhoeus_spec_cache('spec/cache/language_detection/get_language_from_url') do |_hydra|
    describe "#get_language_from_url" do
      before(:each) do
        @url = 'http://www.humboldtbrews.com/2010_index_music.htm'
        request_options = { :source_text => 'cleaned_or_raw' }
        @result = AlchemyApi::LanguageDetection.get_language_from_url(@url, request_options)
      end

      it_should_behave_like 'a language detector'

      it "should get the URL" do
        @result.url.should == @url
      end
    end
  end

  typhoeus_spec_cache('spec/cache/language_detection/get_language_from_text') do |_hydra|
    describe "#get_language_from_text" do
      before(:each) do
        article = fixture_for('article.txt')
        request_options = { :url => "http://test.com" }
        @result = AlchemyApi::LanguageDetection.get_language_from_text(article, request_options)
      end

      it_should_behave_like 'a language detector'
    end
  end

  typhoeus_spec_cache('spec/cache/language_detection/get_language_from_html') do |_hydra|
    describe "#get_language_from_html" do
      before(:each) do
        page = fixture_for('bp_spill.html')
        request_options = {
          :url => "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html",
          :source_text => 'cleaned_or_raw'
        }
        @result = AlchemyApi::LanguageDetection.get_language_from_html(page, request_options)
      end

      it_should_behave_like 'a language detector'
    end
  end
end
@@ -0,0 +1,70 @@
require File.dirname(__FILE__) + "/../spec_helper"

# Specs for the ranked keyword endpoints. Every endpoint is asked for at most
# five keywords plus the source text, and must satisfy the shared
# 'a keyword ranker' examples, which read the @result set up by the group.
describe AlchemyApi::TermExtraction do
  shared_examples_for 'a keyword ranker' do
    it "should return source text" do
      @result.source_text.should_not be_nil
    end

    it "should return 5 keywords" do
      @result.keywords.should have(5).things
    end

    it "should have relevance scores for the keywords" do
      @result.keywords.each do |keyword|
        keyword.relevance.should >= 0.0
        keyword.relevance.should <= 1.0
      end
    end
  end

  typhoeus_spec_cache('spec/cache/term_extraction/get_ranked_keywords_from_html') do |_hydra|
    describe "#get_ranked_keywords_from_html" do
      before(:each) do
        page = fixture_for('bp_spill.html')
        request_options = {
          :url => "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html",
          :max_retrieve => 5,
          :show_source_text => true
        }
        @result = AlchemyApi::TermExtraction.get_ranked_keywords_from_html(page, request_options)
      end

      it_should_behave_like 'a keyword ranker'
    end
  end

  typhoeus_spec_cache('spec/cache/term_extraction/get_ranked_keywords_from_text') do |_hydra|
    describe "#get_ranked_keywords_from_text" do
      before(:each) do
        article = fixture_for('article.txt')
        request_options = {
          :url => "http://test.com",
          :max_retrieve => 5,
          :show_source_text => true
        }
        @result = AlchemyApi::TermExtraction.get_ranked_keywords_from_text(article, request_options)
      end

      it_should_behave_like 'a keyword ranker'
    end
  end

  typhoeus_spec_cache('spec/cache/term_extraction/get_ranked_keywords_from_url') do |_hydra|
    describe "#get_ranked_keywords_from_url" do
      before(:each) do
        @url = 'http://www.businessweek.com/news/2010-05-02/bp-spill-threatens-gulf-of-mexico-oil-gas-operations-update1-.html'
        request_options = { :max_retrieve => 5, :show_source_text => true }
        @result = AlchemyApi::TermExtraction.get_ranked_keywords_from_url(@url, request_options)
      end

      it "should return the given URL" do
        @result.url.should == @url
      end

      it_should_behave_like 'a keyword ranker'
    end
  end
end
@@ -0,0 +1,94 @@
require File.dirname(__FILE__) + "/../spec_helper"

# Specs for the text and title extraction endpoints. Each group is wrapped in
# typhoeus_spec_cache with its own directory under spec/cache; the helper is
# defined outside this file. The yielded hydra is not needed here.
describe AlchemyApi::TextExtraction do
  typhoeus_spec_cache('spec/cache/text_extraction/get_text_from_url') do |_hydra|
    describe "#get_text_from_url" do
      it "should extract text" do
        url = "http://www.chron.com/disp/story.mpl/business/6981685.html"
        result = AlchemyApi::TextExtraction.get_text_from_url(url)

        result.url.should == url
        result.text.should_not be_empty
      end
    end
  end

  typhoeus_spec_cache('spec/cache/text_extraction/get_raw_text_from_url') do |_hydra|
    describe "#get_raw_text_from_url" do
      before(:each) do
        @url = "http://www.chron.com/disp/story.mpl/business/6981685.html"
        @result = AlchemyApi::TextExtraction.get_raw_text_from_url(@url)
      end

      it "should extract url" do
        @result.url.should == @url
      end

      it "should get text back" do
        @result.text.should_not be_empty
      end
    end
  end

  typhoeus_spec_cache('spec/cache/text_extraction/get_title_from_url') do |_hydra|
    describe "#get_title_from_url" do
      before(:each) do
        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
        @result = AlchemyApi::TextExtraction.get_title_from_url(@url)
      end

      it "should extract url" do
        @result.url.should == @url
      end

      # Description matches what is asserted (the title, not the text).
      it "should get title back" do
        @result.title.should =~ /BP Spill/
      end
    end
  end

  typhoeus_spec_cache('spec/cache/text_extraction/get_title_from_html') do |_hydra|
    describe "#get_title_from_html" do
      before(:each) do
        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
        @html = fixture_for('bp_spill.html')
        @result = AlchemyApi::TextExtraction.
          get_title_from_html(@html, :url => @url)
      end

      it "should get title back" do
        @result.title.should =~ /BP Spill/
      end
    end
  end

  typhoeus_spec_cache('spec/cache/text_extraction/get_raw_text_from_html') do |_hydra|
    describe "#get_raw_text_from_html" do
      before(:each) do
        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
        @html = fixture_for('bp_spill.html')
        @result = AlchemyApi::TextExtraction.
          get_raw_text_from_html(@html, :url => @url)
      end

      it "should get text back" do
        @result.text.should_not be_empty
      end
    end
  end

  typhoeus_spec_cache('spec/cache/text_extraction/get_text_from_html') do |_hydra|
    describe "#get_text_from_html" do
      before(:each) do
        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
        @html = fixture_for('bp_spill.html')
        @result = AlchemyApi::TextExtraction.
          get_text_from_html(@html, :url => @url)
      end

      it "should get text back" do
        @result.text.should_not be_empty
      end
    end
  end
end
@@ -0,0 +1,10 @@
require File.dirname(__FILE__) + '/spec_helper'

describe AlchemyApi do
  describe "#api_key" do
    # AlchemyApi.api_key is module-level state; remember it and put it back so
    # the dummy key written below does not leak into specs that run later.
    before(:each) do
      @original_api_key = AlchemyApi.api_key
    end

    after(:each) do
      AlchemyApi.api_key = @original_api_key
    end

    it "should be settable" do
      AlchemyApi.api_key = "fdsa"
      AlchemyApi.api_key.should == 'fdsa'
    end
  end
end