alchemy_api 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. data/.document +5 -0
  2. data/.gitignore +24 -0
  3. data/LICENSE +20 -0
  4. data/README.markdown +36 -0
  5. data/Rakefile +47 -0
  6. data/VERSION +1 -0
  7. data/alchemy_api.gemspec +92 -0
  8. data/lib/alchemy_api.rb +29 -0
  9. data/lib/alchemy_api/base.rb +37 -0
  10. data/lib/alchemy_api/categorization.rb +47 -0
  11. data/lib/alchemy_api/language_detection.rb +51 -0
  12. data/lib/alchemy_api/term_extraction.rb +56 -0
  13. data/lib/alchemy_api/text_extraction.rb +85 -0
  14. data/spec/alchemy_api/base_spec.rb +62 -0
  15. data/spec/alchemy_api/categorization_spec.rb +54 -0
  16. data/spec/alchemy_api/language_detection_spec.rb +72 -0
  17. data/spec/alchemy_api/term_extraction_spec.rb +70 -0
  18. data/spec/alchemy_api/text_extraction_spec.rb +94 -0
  19. data/spec/alchemy_api_spec.rb +10 -0
  20. data/spec/cache/categorization/get_categorization_from_html/ddc3cf50efe5bd5c2159abfb49121cfa2314ca88.cache +29 -0
  21. data/spec/cache/categorization/get_categorization_from_text/8b476a3b532afd2da646b145e9dde07570c27352.cache +29 -0
  22. data/spec/cache/categorization/get_categorization_from_url/7536a34e1d54a95d8ee07d2a98036362761e1621.cache +27 -0
  23. data/spec/cache/language_detection/get_language_from_html/0faf7be978647b611d9c59e1efa497dd76e542f5.cache +33 -0
  24. data/spec/cache/language_detection/get_language_from_text/1ad3f50c1fda37000e24c196f12212ea9d536cb4.cache +33 -0
  25. data/spec/cache/language_detection/get_language_from_url/d077a95e60be0876bb7650ad213f5f43e83454d4.cache +31 -0
  26. data/spec/cache/term_extraction/get_ranked_keywords_from_html/7718a0fbd03739e4213a4e66c32a79a10c3499c3.cache +50 -0
  27. data/spec/cache/term_extraction/get_ranked_keywords_from_text/6f49e68ee4a9150368e671e70b632dbdc40860bb.cache +51 -0
  28. data/spec/cache/term_extraction/get_ranked_keywords_from_url/b9c291523159563d2224d676ec43b7b79a902d21.cache +48 -0
  29. data/spec/cache/text_extraction/get_raw_text_from_html/9db19f848a798db1f9a8c6cce9074d03cf2637a8.cache +27 -0
  30. data/spec/cache/text_extraction/get_raw_text_from_html/e9c236b6e861b57d238c810bb3c307cada170cad.cache +17 -0
  31. data/spec/cache/text_extraction/get_raw_text_from_url/8f5dff27211163e41ea5e7c3c534acf7b87d2098.cache +25 -0
  32. data/spec/cache/text_extraction/get_text_from_html/e7e6dba4c8570a41dbcb05233793018fc5ae4e1e.cache +27 -0
  33. data/spec/cache/text_extraction/get_text_from_url/13facbfeae029d936c7dc18ecaff5d2764b94618.cache +25 -0
  34. data/spec/cache/text_extraction/get_title_from_html/2a526348db23f992fee293d34f94c087e77290c5.cache +27 -0
  35. data/spec/cache/text_extraction/get_title_from_url/e84c0c7c67668706ae0cf3eefcd88c0911cd2b65.cache +25 -0
  36. data/spec/fixtures/article.txt +9 -0
  37. data/spec/fixtures/bp_spill.html +929 -0
  38. data/spec/spec.opts +1 -0
  39. data/spec/spec_helper.rb +20 -0
  40. metadata +136 -0
@@ -0,0 +1,85 @@
module AlchemyApi
  # Value object for text-extraction results: the 'url' and 'text' fields of
  # the JSON response.
  ExtractedText = Struct.new(:url, :text)

  # Value object for title-extraction results: the 'url' and 'title' fields of
  # the JSON response.
  ExtractedTitle = Struct.new(:url, :title)

  # Request definitions for the text- and title-extraction endpoints.
  #
  # Each endpoint is declared with the +post+ DSL: the block receives the
  # caller's arguments, sets the endpoint +uri+ and the request +params+, and
  # registers a +handler+ that converts the response into an ExtractedText or
  # an ExtractedTitle.
  class TextExtraction < Base
    # Extracts the text of the page at +url+.
    #
    # Usage:
    #   AlchemyApi::TextExtraction.get_text_from_url(
    #     "http://google.com",
    #     :use_metadata => 1,
    #     :extract_links => 1)
    #
    # Options (optional trailing hash):
    #   :use_metadata  - sent as useMetadata, defaults to 1
    #   :extract_links - sent as extractLinks, defaults to 0
    post(:get_text_from_url) do |url, *args|
      options = args.first || {}
      uri "#{AlchemyApi.base_uri}/URLGetText"
      params :url => url,
             :useMetadata => options[:use_metadata] || 1,
             :extractLinks => options[:extract_links] || 0

      handler do |response|
        AlchemyApi::TextExtraction.get_text_from_url_handler(response)
      end
    end

    # Extracts the raw text of the page at +url+. Takes no options.
    post(:get_raw_text_from_url) do |url|
      uri "#{AlchemyApi.base_uri}/URLGetRawText"
      params :url => url
      handler do |response|
        AlchemyApi::TextExtraction.get_text_from_url_handler(response)
      end
    end

    # Extracts the title of the page at +url+.
    #
    # Options (optional trailing hash):
    #   :use_metadata - sent as useMetadata, defaults to 1
    post(:get_title_from_url) do |url, *args|
      options = args.first || {}
      uri "#{AlchemyApi.base_uri}/URLGetTitle"
      params :url => url,
             :useMetadata => options[:use_metadata] || 1
      handler do |response|
        AlchemyApi::TextExtraction.get_title_from_url_handler(response)
      end
    end

    # Extracts the text of the supplied +html+ document.
    #
    # Options (optional trailing hash):
    #   :url           - URL sent alongside the HTML, defaults to ''
    #   :use_metadata  - sent as useMetadata, defaults to 1
    #   :extract_links - sent as extractLinks, defaults to 0
    post(:get_text_from_html) do |html, *args|
      options = args.first || {}
      uri "#{AlchemyApi.base_html_uri}/HTMLGetText"
      params :html => html,
             :url => options[:url] || '',
             :useMetadata => options[:use_metadata] || 1,
             :extractLinks => options[:extract_links] || 0
      handler do |response|
        AlchemyApi::TextExtraction.get_text_from_url_handler(response)
      end
    end

    # Extracts the raw text of the supplied +html+ document.
    #
    # Options (optional trailing hash):
    #   :url - URL sent alongside the HTML, defaults to ''
    post(:get_raw_text_from_html) do |html, *args|
      options = args.first || {}
      uri "#{AlchemyApi.base_html_uri}/HTMLGetRawText"
      params :html => html,
             :url => options[:url] || ''
      handler do |response|
        AlchemyApi::TextExtraction.get_text_from_url_handler(response)
      end
    end

    # Extracts the title of the supplied +html+ document.
    #
    # Options (optional trailing hash):
    #   :url          - URL sent alongside the HTML, defaults to ''
    #   :use_metadata - sent as useMetadata, defaults to 1
    post(:get_title_from_html) do |html, *args|
      options = args.first || {}
      uri "#{AlchemyApi.base_html_uri}/HTMLGetTitle"
      params :html => html,
             :url => options[:url] || '',
             :useMetadata => options[:use_metadata] || 1
      handler do |response|
        AlchemyApi::TextExtraction.get_title_from_url_handler(response)
      end
    end

    # Builds an ExtractedTitle from the response of a title endpoint.
    #
    # The decoded JSON is passed through check_json_for_errors_and_raise!
    # first, exactly as get_text_from_url_handler does, so an error response
    # raises instead of silently producing an ExtractedTitle whose fields are
    # nil.
    def self.get_title_from_url_handler(response)
      json = get_json(response)
      check_json_for_errors_and_raise!(json)
      ExtractedTitle.new(json['url'], json['title'])
    end

    # Builds an ExtractedText from the response of a text endpoint, raising
    # via check_json_for_errors_and_raise! when the decoded JSON reports an
    # error.
    def self.get_text_from_url_handler(response)
      json = get_json(response)
      check_json_for_errors_and_raise!(json)
      ExtractedText.new(json['url'], json['text'])
    end
  end
end
@@ -0,0 +1,62 @@
require File.dirname(__FILE__) + '/../spec_helper'

# Checks that AlchemyApi::Base.check_json_for_errors_and_raise! maps each
# 'statusInfo' value of an ERROR payload to its dedicated exception class.
describe AlchemyApi::Base do
  describe "#check_json_for_errors_and_raise!" do
    before(:each) do
      @json = {
        'status' => 'ERROR',
        'url' => 'http://google.com',
        'statusInfo' => nil # replaced by each example below.
      }
    end

    # One row per example: [description, statusInfo value, expected error].
    # An array (not a hash) keeps the examples in a fixed, readable order.
    [
      ["should raise an error if the API key is invalid",
       'invalid-api-key',
       AlchemyApi::InvalidApiKeyError],
      ["should raise an error if the page is not retrievable",
       'cannot-retrieve',
       AlchemyApi::CannotRetrieveUrlError],
      ["should raise an error if the page is not valid HTML",
       'page-is-not-html',
       AlchemyApi::PageIsNotValidHtmlError],
      ["should raise an error if the sent HTML was not valid",
       'invalid-html',
       AlchemyApi::InvalidHtmlError],
      ["should raise an error if the content exceeds the max limit",
       'content-exceeds-size-limit',
       AlchemyApi::ContentExceedsMaxLimitError],
      ["should raise an error if the content cannot be retrieved due to the redirection limit",
       'cannot-retrieve:http-redirect-limit',
       AlchemyApi::RedirectionLimitError],
      ["should raise an UnknownError if we get something we don't recognize",
       'fdsafdsfdsafdskjldklfdad',
       AlchemyApi::UnknownError]
    ].each do |description, status_info, error_class|
      it description do
        @json['statusInfo'] = status_info
        lambda {
          AlchemyApi::Base.check_json_for_errors_and_raise!(@json)
        }.should raise_error(error_class)
      end
    end
  end
end
@@ -0,0 +1,54 @@
require File.dirname(__FILE__) + "/../spec_helper"

# Exercises the three categorization entry points (text, URL, HTML). Each
# group is wrapped in typhoeus_spec_cache with its own cache directory.
describe AlchemyApi::Categorization do
  typhoeus_spec_cache('spec/cache/categorization/get_categorization_from_text') do |hydra|
    describe "#get_categorization_from_text" do
      before(:each) do
        @url = "http://test.com"
        article = fixture_for('article.txt')

        @category = AlchemyApi::Categorization.get_categorization_from_text(article)
      end

      it "should return a category name" do
        @category.name.should_not be_nil
      end
    end
  end

  typhoeus_spec_cache('spec/cache/categorization/get_categorization_from_url') do |hydra|
    describe "#get_categorization_from_url" do
      before(:each) do
        @url = 'http://www.macrumors.com/2010/04/30/apples-discontinuation-of-lala-streaming-music-service-not-likely-leading-to-imminent-launch-of-web-focused-itunes/'
        options = { :source_text => 'cleaned_or_raw' }
        @category = AlchemyApi::Categorization.get_categorization_from_url(@url, options)
      end

      it "should return a category name" do
        @category.name.should_not be_nil
      end

      it "should return a url" do
        @category.url.should_not be_nil
      end
    end
  end

  typhoeus_spec_cache('spec/cache/categorization/get_categorization_from_html') do |hydra|
    describe "#get_categorization_from_html" do
      before(:each) do
        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
        @html = fixture_for('bp_spill.html')
        options = { :url => @url, :source_text => 'cleaned_or_raw' }
        @category = AlchemyApi::Categorization.get_categorization_from_html(@html, options)
      end

      it "should return a category" do
        @category.name.should_not be_nil
      end
    end
  end
end
@@ -0,0 +1,72 @@
require File.dirname(__FILE__) + "/../spec_helper"

# Exercises language detection from a URL, from plain text and from HTML.
# All three groups share the 'a language detector' examples, which expect the
# group's before block to have stored the detection result in @result.
describe AlchemyApi::LanguageDetection do
  shared_examples_for 'a language detector' do
    it "should get a language" do
      @result.language.should_not be_nil
    end

    it "should get ISO codes" do
      [:iso_639_1, :iso_639_2, :iso_639_3].each do |iso_code|
        @result.send(iso_code).should_not be_nil
      end
    end

    it "should get the ethnologue URL" do
      @result.ethnologue_url.should_not be_nil
    end

    it "should get a native speaker count" do
      @result.native_speakers.should_not be_nil
    end

    it "should get a wikipedia URL" do
      @result.wikipedia_url.should_not be_nil
    end
  end

  typhoeus_spec_cache('spec/cache/language_detection/get_language_from_url') do |hydra|
    describe "#get_language_from_url" do
      before(:each) do
        @url = 'http://www.humboldtbrews.com/2010_index_music.htm'
        options = { :source_text => 'cleaned_or_raw' }
        @result = AlchemyApi::LanguageDetection.get_language_from_url(@url, options)
      end

      it_should_behave_like 'a language detector'

      it "should get the URL" do
        @result.url.should == @url
      end
    end
  end

  typhoeus_spec_cache('spec/cache/language_detection/get_language_from_text') do |hydra|
    describe "#get_language_from_text" do
      before(:each) do
        @url = "http://test.com"
        article = fixture_for('article.txt')
        @result = AlchemyApi::LanguageDetection.get_language_from_text(article, :url => @url)
      end

      it_should_behave_like 'a language detector'
    end
  end

  typhoeus_spec_cache('spec/cache/language_detection/get_language_from_html') do |hydra|
    describe "#get_language_from_html" do
      before(:each) do
        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
        @html = fixture_for('bp_spill.html')
        options = { :url => @url, :source_text => 'cleaned_or_raw' }
        @result = AlchemyApi::LanguageDetection.get_language_from_html(@html, options)
      end

      it_should_behave_like 'a language detector'
    end
  end
end
@@ -0,0 +1,70 @@
require File.dirname(__FILE__) + "/../spec_helper"

# Exercises ranked-keyword extraction from HTML, plain text and a URL. All
# three groups share the 'a keyword ranker' examples, which expect the group's
# before block to have stored the extraction result in @result.
describe AlchemyApi::TermExtraction do
  shared_examples_for 'a keyword ranker' do
    it "should return source text" do
      @result.source_text.should_not be_nil
    end

    it "should return 5 keywords" do
      @result.keywords.should have(5).things
    end

    it "should have relevance scores for the keywords" do
      @result.keywords.each do |keyword|
        score = keyword.relevance
        score.should >= 0.0
        score.should <= 1.0
      end
    end
  end

  typhoeus_spec_cache('spec/cache/term_extraction/get_ranked_keywords_from_html') do |hydra|
    describe "#get_ranked_keywords_from_html" do
      before(:each) do
        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
        @html = fixture_for('bp_spill.html')
        options = {
          :url => @url,
          :max_retrieve => 5,
          :show_source_text => true
        }
        @result = AlchemyApi::TermExtraction.get_ranked_keywords_from_html(@html, options)
      end

      it_should_behave_like 'a keyword ranker'
    end
  end

  typhoeus_spec_cache('spec/cache/term_extraction/get_ranked_keywords_from_text') do |hydra|
    describe "#get_ranked_keywords_from_text" do
      before(:each) do
        @url = "http://test.com"
        article = fixture_for('article.txt')
        options = {
          :url => @url,
          :max_retrieve => 5,
          :show_source_text => true
        }
        @result = AlchemyApi::TermExtraction.get_ranked_keywords_from_text(article, options)
      end

      it_should_behave_like 'a keyword ranker'
    end
  end

  typhoeus_spec_cache('spec/cache/term_extraction/get_ranked_keywords_from_url') do |hydra|
    describe "#get_ranked_keywords_from_url" do
      before(:each) do
        @url = 'http://www.businessweek.com/news/2010-05-02/bp-spill-threatens-gulf-of-mexico-oil-gas-operations-update1-.html'
        options = {
          :max_retrieve => 5,
          :show_source_text => true
        }
        @result = AlchemyApi::TermExtraction.get_ranked_keywords_from_url(@url, options)
      end

      it "should return the given URL" do
        @result.url.should == @url
      end

      it_should_behave_like 'a keyword ranker'
    end
  end
end
@@ -0,0 +1,94 @@
require File.dirname(__FILE__) + "/../spec_helper"

# Exercises the text, raw-text and title extraction entry points, each from a
# URL and from an HTML fixture. Each group is wrapped in typhoeus_spec_cache
# with its own cache directory.
describe AlchemyApi::TextExtraction do
  typhoeus_spec_cache('spec/cache/text_extraction/get_text_from_url') do |hydra|
    describe "#get_text_from_url" do
      it "should extract text" do
        url = "http://www.chron.com/disp/story.mpl/business/6981685.html"
        result = AlchemyApi::TextExtraction.get_text_from_url(url)

        result.url.should == url
        result.text.should_not be_empty
      end
    end
  end

  typhoeus_spec_cache('spec/cache/text_extraction/get_raw_text_from_url') do |hydra|
    describe "#get_raw_text_from_url" do
      before(:each) do
        @url = "http://www.chron.com/disp/story.mpl/business/6981685.html"
        @result = AlchemyApi::TextExtraction.get_raw_text_from_url(@url)
      end

      it "should extract url" do
        @result.url.should == @url
      end

      it "should get text back" do
        @result.text.should_not be_empty
      end
    end
  end

  typhoeus_spec_cache('spec/cache/text_extraction/get_title_from_url') do |hydra|
    describe "#get_title_from_url" do
      before(:each) do
        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
        @result = AlchemyApi::TextExtraction.get_title_from_url(@url)
      end

      it "should extract url" do
        @result.url.should == @url
      end

      # This example asserts on the title, so it is named for the title
      # (matching the #get_title_from_html group below) rather than "text".
      it "should get title back" do
        @result.title.should =~ /BP Spill/
      end
    end
  end

  typhoeus_spec_cache('spec/cache/text_extraction/get_title_from_html') do |hydra|
    describe "#get_title_from_html" do
      before(:each) do
        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
        @html = fixture_for('bp_spill.html')
        @result = AlchemyApi::TextExtraction.
          get_title_from_html(@html, :url => @url)
      end

      it "should get title back" do
        @result.title.should =~ /BP Spill/
      end
    end
  end

  typhoeus_spec_cache('spec/cache/text_extraction/get_raw_text_from_html') do |hydra|
    describe "#get_raw_text_from_html" do
      before(:each) do
        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
        @html = fixture_for('bp_spill.html')
        @result = AlchemyApi::TextExtraction.
          get_raw_text_from_html(@html, :url => @url)
      end

      it "should get text back" do
        @result.text.should_not be_empty
      end
    end
  end

  typhoeus_spec_cache('spec/cache/text_extraction/get_text_from_html') do |hydra|
    describe "#get_text_from_html" do
      before(:each) do
        @url = "http://www.businessweek.com/news/2010-04-29/bp-spill-may-alter-obama-s-offshore-drilling-plans-update1-.html"
        @html = fixture_for('bp_spill.html')
        @result = AlchemyApi::TextExtraction.
          get_text_from_html(@html, :url => @url)
      end

      it "should get text back" do
        @result.text.should_not be_empty
      end
    end
  end
end
@@ -0,0 +1,10 @@
require File.dirname(__FILE__) + '/spec_helper'

# Checks that the module-level API key can be written and read back.
describe AlchemyApi do
  describe "#api_key" do
    # AlchemyApi.api_key is set on the module itself, so the dummy value
    # written below would otherwise stay in place for any spec that runs
    # afterwards. Remember the current key and put it back after each example.
    before(:each) do
      @original_api_key = AlchemyApi.api_key
    end

    after(:each) do
      AlchemyApi.api_key = @original_api_key
    end

    it "should be settable" do
      AlchemyApi.api_key = "fdsa"
      AlchemyApi.api_key.should == 'fdsa'
    end
  end
end