diffbot_simple 0.0.4 → 1.0.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 5858d1e4c275759f39cdb4ec782fa2e7e69e18bd
- data.tar.gz: 62eab98cb5df667fa5d47f7bcee6968774a3f14c
+ metadata.gz: ddb7b5f429829cc472112429600c1456fd5ba83e
+ data.tar.gz: 2ce6e971d8f396f5e0460dba844a3885fc4a55bd
  SHA512:
- metadata.gz: 10a2d414f7332febb4b96b4fb0e394463ee1f42b36e93cb328282e6015020d7f302707b33da354e410783322248a70f9d2fb22f4e5ade402bcce2cd5d1cfa64a
- data.tar.gz: 8804c3d2c4371cdf4757a7e36d09b015e4539b8a0e22cb1fa255f60252574fd4d7d76fbae34144e0393413d37d4ff30ca4012b22039c28f2dda47bbebeb339c2
+ metadata.gz: 2d90e57dfe1ee1b4fdf8380f4c3d29e293d0034b57dbf6984e76ed1bc64bed7c5965d3f8974be492544676836ca5269886c5870032a60cc30680d1fd04aa0d0b
+ data.tar.gz: 78206846dde0211f4f6d9e701caf3a3b8a5ff152b3981716622ac0c2d36535e746d7f8ec60868b674399bf079fb79d60d8d122e2bdbdd6dfa0e6a889b1642600
data/README.md CHANGED
@@ -8,8 +8,10 @@ DiffbotSimple
 
  A simple, nothing-fancy, helper for the [Diffbot API](http://www.diffbot.com/).
 
- Will not objectify any responses, just pass on the json data as hash with symbolized keys.
- One exception to that rule, when using CrawlBot and requesting a single_crawl, it will return the single item in the :jobs-array, and when requesting all, it will return the array in :jobs.
+ Will not objectify any responses; Bulk and Crawlbot, however, are lightly wrapped.
+ For these two APIs it ignores the success message and deals only with the contents of the `:jobs` array.
+
+ For the other APIs it just passes the JSON data on to the request as a hash with symbolized keys.
  Send options to the api as named args, see usage below with the article and fields arguments.
 
  ## Installation
@@ -33,7 +35,7 @@ client = DiffbotSimple::V2::Client.new token: token
  article = client.article
  url = "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/"
  # Pass on diffbot parameters as options to the call
- diffbot_response_as_symbolized_hash = article.single_article url: url, fields: "icon,title"
+ diffbot_response_as_symbolized_hash = article.request url: url, fields: "icon,title"
  # =>
  {
    icon: "http://www.xconomy.com/wordpress/wp-content/themes/xconomy/images/favicon.ico",
@@ -56,39 +58,60 @@ client = DiffbotSimple::V2::Client.new token: token
  url = "http://some_url_to_check"
 
  # Custom API
+ # will raise an error if the API does not exist; create it at http://www.diffbot.com/dev/customize/
  custom = client.custom name: "my_custom_api_name"
- response = custom.single_custom url: url
+ response = custom.request url: url
 
  # Analyze API (beta)
- analysis = client.analyze
- response = analyze.single_analysis url: url
+ analyze = client.analyze
+ response = analyze.request url: url
 
  # Article API
  article = client.article
- response = article.single_article url: url
+ response = article.request url: url
 
  # Image API
  image = client.image
- response = image.single_image url: url
+ response = image.request url: url
 
  # Product API
  product = client.product
- response = product.single_product url: url
+ response = product.request url: url
 
  # Crawlbot API
- crawlbot = client.crawlbot
- all_my_crawls = crawlbot.all
- current_settings = crawlbot.single_crawl name: "my_crawl"
+ all_my_crawls = client.crawl
+ crawl = client.crawl name: "mycrawl"
+ current_parameters = crawl.parameters
  # shorthand for using apiUrl, use the api object from client,
  # it will create a correct value for you
  # (custom, image, article, product or analyze for automatic)
- # A call to single_crawl will create if not exists or update settings
- settings = crawlbot.single_crawl name: "my_new_crawl", onlyProcessIfNew: 0, seeds: "http://www.upptec.se", apiUrl: custom
- crawlbot.pause name: "my_new_crawl"
- crawlbot.unpause name: "my_new_crawl"
- crawlbot.restart name: "my_new_crawl"
- result = crawlbot.result "my_new_crawl" # shorthand for downloading the json that are specifed in :downloadJson
- crawlbot.delete name: "my_new_crawl"
+ crawl.apiUrl = product # the object from above
+ # A call to client.crawl name: "mycrawl" will create the crawl if it does not exist
+ # (works with a symbol too, client.crawl name: :mycrawl)
+ # To update parameters:
+ crawl.update onlyProcessIfNew: 0, seeds: "http://www.upptec.se", apiUrl: custom
+ # or one property at a time; this works only on already loaded parameters
+ crawl.onlyProcessIfNew = 0 # sends the update immediately to diffbot
+ crawl.seeds = "http://www.upptec.se" # sends the update immediately to diffbot
+ # direct read access by name:
+ current_seeds = crawl.seeds
+ # actions:
+ crawl.pause
+ crawl.unpause
+ crawl.restart
+ # results is shorthand for downloading the json specified in :downloadJson
+ results = crawl.results
+ crawl.delete!
+
+ # Bulk API
+ # is based on crawlbot and works exactly the same
+ all_my_bulk_jobs = client.bulk
+ bulk = client.bulk name: "mycrawl"
+ current_parameters = bulk.parameters
+ # and so on, as with crawlbot above;
+ # however, you can add urls to process as an array using the #process method:
+ bulk.process ["http://foo.bar", "http://bar.foo"]
+
  ```
 
  ### On error
@@ -96,7 +119,6 @@ If Diffbot returns an error, it will raise and fill `DiffbotSimple::V2::DiffbotE
 
  ## TODO
  * Frontpage API
- * Bulk API
  * Async http fetching
  * Batch API
 
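The hunk above references the README's "On error" section: a Diffbot error response raises a `DiffbotSimple::V2::DiffbotError`. A minimal sketch of handling it, assuming only what the README states and reusing the `article` object from the usage example (url and message format are illustrative):

```ruby
begin
  article.request url: "http://example.com/no-such-page"
rescue DiffbotSimple::V2::DiffbotError => e
  # the error object is filled with what Diffbot returned
  warn "Diffbot reported an error: #{e.message}"
end
```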
data/lib/diffbot_simple/symbolize.rb CHANGED
@@ -1,29 +1,32 @@
  module DiffbotSimple
    module Symbolize
-     private
-     def y_combinator(&f)
-       lambda do |g|
-         f.call {|*args| g[g][*args]}
-       end.tap {|g| break g[g]}
-     end
-
      def symbolize hash
        return hash unless hash.kind_of? Hash or hash.kind_of? Array
        sym_hash = y_combinator do |&f|
          lambda do |h|
            if h.kind_of? Array
              h.map {|r| f.call(r)}
-           else
+           elsif h.kind_of? Hash
              h.reduce({}) do |memo,(k,v)|
                v = f.call(v) if v.kind_of? Hash
                v = v.map {|u| f.call(u)} if v.kind_of? Array
                memo[k.to_sym] = v
                memo
              end
+           else
+             h
            end
          end
        end
        sym_hash.call hash
      end
+
+     private
+     def y_combinator(&f)
+       lambda do |g|
+         f.call {|*args| g[g][*args]}
+       end.tap {|g| break g[g]}
+     end
+     module_function :symbolize, :y_combinator
    end
  end
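Since `module_function` now exposes `symbolize` on the module itself, its effect can be sketched directly (sample data invented for illustration): string keys become symbols, recursing through hashes and arrays, while non-hash values pass through untouched thanks to the new `else` branch.

```ruby
require 'diffbot_simple'

# a nested, string-keyed structure such as parsed Diffbot JSON
DiffbotSimple::Symbolize.symbolize(
  "title"  => "Example",
  "images" => [{ "url" => "http://example.com/a.png" }]
)
# => { title: "Example", images: [{ url: "http://example.com/a.png" }] }
```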
data/lib/diffbot_simple/v2/analyze.rb CHANGED
@@ -5,14 +5,9 @@ module DiffbotSimple::V2
      def post_initialize
        @api = :analyze
      end
-     def to_crawl_api_url
+     def to_api_url
        default = super
        "#{default}?mode=auto"
      end
-     def single_analysis url: nil, **options
-       raise ArgumentError.new "Must pass an url to fetch" unless url
-       execute_call options.merge(url: url)
-     end
-     alias :single_analyze :single_analysis
    end
  end
data/lib/diffbot_simple/v2/api_helper.rb CHANGED
@@ -9,9 +9,14 @@ module DiffbotSimple::V2
      def post_initialize
        raise "Must overload to set api path"
      end
-     def to_crawl_api_url
+     def to_api_url
        "#{api_client.site}#{api}"
      end
+     # overload if necessary
+     def request url: nil, **options
+       raise ArgumentError.new "Must pass an url for the request to work" unless url
+       execute_call options.merge(url: url)
+     end
      private
      attr_reader :token, :api_client, :api
      def execute_call custom_headers: nil, method: :get, payload: nil, **options
@@ -33,7 +38,8 @@ module DiffbotSimple::V2
        merged
      end
      def expand_api_url api_url
-       api_url.to_crawl_api_url if api_url.respond_to?(:to_crawl_api_url)
+       return api_url.to_api_url if api_url.respond_to?(:to_api_url)
+       return api_url
      end
      def raise_if_error_response result_from_diffbot
        return unless result_from_diffbot[:error]
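The change to `expand_api_url` fixes a subtle bug: the old one-line version returned `nil` for any value that did not respond to `to_crawl_api_url`, silently dropping a plain `apiUrl` string. A sketch of the two paths through the new version (return values are illustrative):

```ruby
# inside ApiHelper, after the change:
expand_api_url(analyze)                # api object: delegates to analyze.to_api_url
expand_api_url("http://my.custom/api") # plain string: returned unchanged (was nil before)
```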
data/lib/diffbot_simple/v2/article.rb CHANGED
@@ -5,7 +5,7 @@ module DiffbotSimple::V2
      def post_initialize
        @api = :article
      end
-     def single_article url: nil, custom_headers: nil, body: nil, **options
+     def request url: nil, custom_headers: nil, body: nil, **options
        raise ArgumentError.new "Must pass an url for the article api to fetch" unless url
        if body
          custom_headers ||= {}
data/lib/diffbot_simple/v2/bulk.rb ADDED
@@ -0,0 +1,11 @@
+ module DiffbotSimple::V2
+   class Bulk < Crawl
+     def initialize bulk_api: nil, name: nil, init: {}, **parameters
+       super parameters.merge(name: name, crawlbot_api: bulk_api, init: init)
+     end
+     def process urls_to_process
+       urls_to_process = [urls_to_process] unless urls_to_process.respond_to? :join
+       send_to_api urls: urls_to_process.join(" ")
+     end
+   end
+ end
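`process` accepts either a single url or an array: anything that does not respond to `join` (such as a `String`) is wrapped first, and the urls are then sent as one space-separated `urls` parameter. A short sketch (urls are illustrative):

```ruby
bulk = client.bulk name: "mybulk"
bulk.process "http://foo.bar"                      # single url is wrapped into an array
bulk.process ["http://foo.bar", "http://bar.foo"]  # sent as "http://foo.bar http://bar.foo"
```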
data/lib/diffbot_simple/v2/bulk_api.rb ADDED
@@ -0,0 +1,8 @@
+ module DiffbotSimple::V2
+   # Complies to http://www.diffbot.com/dev/docs/bulk/
+   class BulkApi < CrawlbotApi
+     def post_initialize
+       @api = :bulk
+     end
+   end
+ end
data/lib/diffbot_simple/v2/client.rb CHANGED
@@ -1,13 +1,12 @@
  require_relative 'api_client'
  module DiffbotSimple::V2
    class Client
-     def initialize token: nil
+     def initialize token: nil, bulk_api: nil, api_client: nil, crawlbot_api: nil
        raise ArgumentError.new("Must supply developer token") if token.to_s.empty?
        @token = token
-       @api_client = ApiClient.new
-     end
-     def crawlbot
-       Crawlbot.new api_client: api_client, token: token
+       @api_client = api_client ||= ApiClient.new
+       @bulk_api = bulk_api ||= BulkApi.new(api_client: api_client, token: token)
+       @crawlbot_api = crawlbot_api ||= CrawlbotApi.new(api_client: api_client, token: token)
      end
      def article
        Article.new api_client: api_client, token: token
@@ -24,7 +23,15 @@ module DiffbotSimple::V2
      def analyze
        Analyze.new api_client: api_client, token: token
      end
+     def bulk name: nil
+       return bulk_api.all.map { |e| Bulk.new name: e.delete(:name), init: e, bulk_api: bulk_api } unless name
+       return Bulk.new name: name, bulk_api: bulk_api
+     end
+     def crawl name: nil
+       return crawlbot_api.all.map { |e| Crawl.new name: e.delete(:name), init: e, crawlbot_api: crawlbot_api } unless name
+       return Crawl.new name: name, crawlbot_api: crawlbot_api
+     end
      private
-     attr_reader :token, :api_client
+     attr_reader :token, :api_client, :bulk_api, :crawlbot_api
    end
  end
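The widened `initialize` signature makes the client's collaborators injectable, which is mainly useful in tests. A hedged sketch inside an RSpec example (`fake_api_client` and the stubbed `site` value are hypothetical; `site` is the only method ApiHelper is shown calling on the api client):

```ruby
fake_api_client = double("api client", site: "http://api.diffbot.com/v2/")
client = DiffbotSimple::V2::Client.new token: "token", api_client: fake_api_client
```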
data/lib/diffbot_simple/v2/crawl.rb ADDED
@@ -0,0 +1,53 @@
+ module DiffbotSimple::V2
+   class Crawl
+     attr_reader :parameters, :name
+     def initialize crawlbot_api: nil, name: nil, init: {}, **parameters
+       @crawlbot_api = crawlbot_api
+       @name = name
+       if init.empty?
+         send_to_api parameters
+       else
+         @parameters = init
+       end
+     end
+     def pause
+       send_to_api pause: 1
+     end
+     def unpause
+       send_to_api pause: 0
+     end
+     def delete!
+       send_to_api delete: 1
+       @parameters = {}
+     end
+     def restart
+       send_to_api restart: 1
+     end
+     def update **parameters
+       send_to_api parameters
+     end
+     def results
+       crawlbot_api.results url: parameters[:downloadJson]
+     end
+     def refresh
+       send_to_api
+     end
+     def apiUrl= api_url
+       send_to_api apiUrl: api_url
+     end
+     def method_missing property, *args
+       key = property.to_s.gsub(/\=$/,"").to_sym
+       super unless parameters.has_key? key
+       return send_to_api({ key => args.join(",") }) if property.to_s.match(/\=$/) or !args.empty?
+       return parameters[key]
+     end
+     private
+     attr_reader :crawlbot_api
+     def send_to_api **options
+       params = options.merge({name: name})
+       @parameters = crawlbot_api.single params
+       @parameters.delete :name
+       self
+     end
+   end
+ end
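The `method_missing` at the bottom turns every loaded parameter into a reader and a writer, but only for keys already present in `parameters`; unknown names still fall through to `super` and raise `NoMethodError`. A sketch of the three cases (parameter names are illustrative):

```ruby
crawl.seeds                       # read: returns parameters[:seeds]
crawl.seeds = "http://a.example"  # write: sends the update to diffbot immediately
crawl.no_such_parameter           # NoMethodError, key not in loaded parameters
```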
data/lib/diffbot_simple/v2/crawlbot_api.rb ADDED
@@ -0,0 +1,22 @@
+ module DiffbotSimple::V2
+   # Complies to http://www.diffbot.com/dev/docs/crawlbot/
+   class CrawlbotApi
+     include ApiHelper
+     def post_initialize
+       @api = :crawl
+     end
+     def all
+       execute_call()[:jobs].select { |e| e[:type] == @api.to_s }
+     end
+     def single name: nil, **options
+       response = execute_call options.merge(name: name)
+       return response[:jobs].select { |e| e[:type] == @api.to_s }.first if response.has_key?(:jobs)
+       response
+     end
+     def results url: nil
+       return [] unless url
+       response = api_client.get url
+       symbolize response
+     end
+   end
+ end
data/lib/diffbot_simple/v2/custom.rb CHANGED
@@ -5,15 +5,11 @@ module DiffbotSimple::V2
      attr_reader :name
      def initialize name: nil, **options
        raise ArgumentError.new "Must pass a name for the custom api" unless name
-       @name = name
+       @name = name.to_s
        super options
      end
      def post_initialize
        @api = "api/#{CGI::escape(name)}"
      end
-     def single_custom url: nil, **options
-       raise ArgumentError.new "Must pass an url for the custom api to fetch" unless url
-       execute_call options.merge(url: url)
-     end
    end
  end
data/lib/diffbot_simple/v2/image.rb CHANGED
@@ -5,9 +5,5 @@ module DiffbotSimple::V2
      def post_initialize
        @api = :image
      end
-     def single_image url: nil, **options
-       raise ArgumentError.new "Must pass an url to fetch" unless url
-       execute_call options.merge(url: url)
-     end
    end
  end
data/lib/diffbot_simple/v2/product.rb CHANGED
@@ -5,9 +5,5 @@ module DiffbotSimple::V2
      def post_initialize
        @api = :product
      end
-     def single_product url: nil, **options
-       raise ArgumentError.new "Must pass an url to fetch" unless url
-       execute_call options.merge(url: url)
-     end
    end
  end
data/lib/diffbot_simple/version.rb CHANGED
@@ -1,3 +1,3 @@
  module DiffbotSimple
-   VERSION = "0.0.4"
+   VERSION = "1.0.0"
  end
data/lib/diffbot_simple.rb CHANGED
@@ -3,12 +3,15 @@ require 'diffbot_simple/symbolize'
  require 'diffbot_simple/v2/diffbot_error'
  require 'diffbot_simple/v2/api_helper'
  require 'diffbot_simple/v2/client'
- require 'diffbot_simple/v2/crawlbot'
+ require 'diffbot_simple/v2/crawlbot_api'
+ require 'diffbot_simple/v2/crawl'
  require 'diffbot_simple/v2/article'
  require 'diffbot_simple/v2/custom'
  require 'diffbot_simple/v2/product'
  require 'diffbot_simple/v2/image'
  require 'diffbot_simple/v2/analyze'
+ require 'diffbot_simple/v2/bulk_api'
+ require 'diffbot_simple/v2/bulk'
 
  module DiffbotSimple
  end
data/spec/serialize_test_data.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "icon": "http://www.xconomy.com/wordpress/wp-content/themes/xconomy/images/favicon.ico",
+   "author": "Wade Roush",
+ "text": "You know how the Picturephone , a half-billion-dollar project at AT&T back in the 1960s and 1970s, turned out to be a huge commercial flop, but two-way video communication eventually came back with a vengeance in the form of Skype and FaceTime and Google Hangouts? Well, something similar is going on with the Semantic Web.\n\nThat’s the proposal, dating back almost to the invention of the Web in the 1990s, that the various parts of Web pages should be tagged so that machines, as well as people, can make inferences based on the information they contain. The idea has never gotten very far, mainly because the burden of tagging all that content would fall to humans, which makes it expensive and tedious. But now it looks like the original goal of making digital content more comprehensible to computers might be achievable at far lower cost, thanks to better software.\n\nDiffbot is building that software. This unusual startup—the first ever to emerge from the Stanford-based accelerator , back in 2009—is using computer vision technology similar to that used for robotics applications such as self-driving cars to classify the parts of Web pages so that they can be reassembled in other forms. AOL is one of the startup’s first big customers and its landlord. It’s using Diffbot’s technology to assemble Editions by AOL , the personalized, iPad-based magazine comprised of content culled from AOL properties like the Huffington Post, TechCrunch, and Engadget.\n\n\nI went down to AOL’s Palo Alto campus last month to meet the company’s founder and CEO Mike Tung and its vice president of products John Davi. They didn’t deliberately set out to solve the Semantic Web problem, any more than the founders of Skype set out to build an affordable Picturephone. But their venture, which has attracted about $2 million in backing from Andy Bechtolsheim and a raft of other angel investing stars, is already on its way to creating one of the world’s largest structured indexes of unstructured Web content.\n\nWithout relying on HTML tags (which can actually be used to trick traditional Web crawling software), Diffbot can look at a news page and tell what’s a headline, what’s a byline, where the article text begins and ends, what’s an advertisement, and so forth. What practical use can companies make of that, and where’s the profit in it for Diffbot? Well, aside from AOL, the startup’s software is already being used in some interesting places: reading app maker uses it to extract article text from websites, and content discovery service employs it to screen out spam.\n\nIn fact, companies pay Diffbot to analyze more than 100 million unique URLs per month. And that’s just the beginning. Building outward from its early focus on news articles, the startup is creating new algorithms that could make sense of many kinds of sites, such as e-commerce catalogs. The individual elements of those sites could then be served up in almost any context. Imagine a Siri for shopping, to take just one example. “We’re building a series of wedges that will add up to a complete view of the Web,” says Davi. 
“We are excited about having them all under our belt, so there can be a fully indexed, reverse-engineered Semantic Web.”\n\nWhat follows is a highly compressed version of my conversation with Tung and Davi.\n\nXconomy: Where did you guys meet, and how did you end up working on Diffbot?\n\nMike Tung: I worked at Microsoft on Windows Vista right out of high school, then went to college at Cal and studied electrical engineering for two years, then went to Stanford to start a PhD in computer science, specializing in AI. When I first moved to Silicon Valley, I also worked at a bunch of startups. I was engineer number four at TheFind, which was a product search company that built the world’s largest product index. I worked on search at Yahoo and eBay, and also did a bunch of contract work. I took the patent bar and worked as a patent lawyer for a couple of years, writing 3G and 4G patents for Panasonic and Matsushita. I first met John when we were working at a startup called ClickTV, which was a video-player-search-engine thing. It was pretty advanced for its time.\n\nDiffbot began when I was in grad school at Stanford [in 2005]. There was this one quarter where I was taking a lot of classes, so I made this tool for myself to keep track of all of them. I would put in the URL for the class website, and whenever a professor would upload new slides or content, Diffbot would find that and download it to my phone. I always felt like I knew what was going on in my classes without having to attend every single one.\n\nIt was useful, and my friends started asking me whether they could use it. So I turned it into a Web service and \n\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can subscribe to his Google Group or e-mail him at wroush@xconomy.com .\n(Page 2 of 4) \n\n\t\t\tstarted running it out of a dorm at Stanford. And people started adding a bunch of different kinds of URLs to Diffbot outside of classes, like they might add Craigslist if they were searching for a job or a product, or Facebook if they wanted to see if their ex’s profile had changed.\n\nX: So I assume the name “Diffbot” related to comparing the old and new versions of a website and detecting the differences?\n\nMT: Yes, but just doing deltas on Web pages doesn’t work too well. It turns out that on the modern Web, every page refresh changes the ads and the counters. You have to be a little more intelligent.\n\nThat’s where understanding the page comes into play. I was studying machine learning at Stanford, and in particular one project I had worked on was the vision system for the self-driving car [Stanford’s entry in the 2007 DARPA Urban Challenge]. This was the stereo camera system that would compute the depth of a scene and say, ‘This is a cactus, this is drivable dirt, this is not drivable dirt, this is a cliff, this is a very narrow passageway.’ I realized that one way of making Diffbot generalizable was to apply computer vision to Web pages. Not to say, ‘This is a cactus and this is a pedestrian,’ but to say, ‘This is an advertisement and this is a footer and this is a product.’\n\nA human being can look at Web page and very easily tell what type of page it is without even looking at the text, and that is what we are teaching Diffbot to do. 
The goal is to build a machine-readable version of the entire Web.\n\nX: Isn’t that what Tim Berners-Lee has been talking about for years—building a Semantic Web that’s machine-readable?\n\nMT: It seems that every three years or so a new Semantic Web technology gets hyped up again. There was RSS, RDF, OWL, and now it’s Open Graph and the Knowledge Graph. The central problem—why none of these have really gone mainstream—is that you are requiring humans to tag the content twice, once for the machine’s benefit and once for the actual humans. Because you are placing so much onus on the content creators, you are never going to have all of the content in any given system. So it will be fragmented into different Semantic Web file formats, and because of that you will never have an app that allows you to search and evaluate all that information.\n\nBut what if you analyze the page itself? That is where we have an opportunity, by applying computer vision to eliminate the problem of manual tagging. And we have reached a certain point in the technology continuum where it is actually possible—where the CPUs are fast enough and the machine learning technology is good enough that we have a good shot of doing it with high accuracy.\n\nX: Why are you so convinced that a human-tagged Semantic Web would never work?\n\nMT: The number one point is that people are lazy. The second is that people lie. Google used to read the meta tags and keywords at the top of a Web page, and so people would start stuffing those areas with everything. It didn’t correspond to what actual humans saw. The same thing holds for Semantic Web formats. Whenever you have things indexed separately, you start to see spam. By using a robot to look at the page, you are keeping it above that.\n\nX: Talk about the computer vision aspect of Diffbot. How literal is the comparison to the cameras and radar on robot cars?\n\nMT: We use the very same techniques used in computer vision, for example object detection and edge detection. If you are a customer, you give us a URL to analyze. We render the page using a virtual Webkit browser in the cloud. It will render the page, run the Javascript, and lay everything out with the CSS rules and everything. Then we have these hooks into Webkit that \n\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can subscribe to his Google Group or e-mail him at wroush@xconomy.com .\n(Page 3 of 4) \n\n\t\t\tallow us to get all of the visual and geometric information out of the page. For every rectangle, we pull out things like the x and y coordinates, the heights and widths, the positioning relative to everything else, the font sizes, the colors, and other visual cues. In much the same way, when I was working on the self-driving car, we would look at a patch and do edge detection to determine the shape of a thing or find the horizon.\n\nX: Once you identify those shapes and other elements, how do you say, “This is a headline, this is an article,” et cetera?\n\nMT: We have an ontology. Other people have done good work defining what those ontologies should be—there are many of them at schema.org, which reflects what the search engines have proposed as ontologies. We also have human beings who draw rectangles on the pages and teach Diffbot “this is what an author field looks like, this is what a product looks like, this is what a price looks like,” and from those rectangles we can generalize. 
It’s a machine learning system, so it lives and breathes on the training data that is fed into it.\n\nX: Do you actually do all the training work yourselves, or do you crowdsource it out somehow?\n\nJohn Davi: We have done a combination of things. We always have a cold-start problem firing up new type of pages—products versus articles, or a new algorithm for press releases, for example. We leverage both grunt work internally—just grinding out our own examples, which has the side benefit of keeping us informed about the real world—but yeah, also crowdsourcing, which gives us a much broader variety of input and opinion. We have used everything, including off-the-shelf crowdsourcing tools like Mechanical Turk and Crowdflower, and we have build up our own group of quasi-contract crowdsourcers.\n\nOur basic effort is to cold-start it ourselves, then get an alpha-level product into the hands of our customer, which will then drastically increase the amount of training data we have. Sometimes we look at the stream of content and eyeball it and manually tweak and correct. In a lot of cases our customer gets involved. If they have an interest in helping to train the algorithm—it not only makes it better for them, but if they are first out of the gate they can tailor the algorithm to their very particular needs.\n\nX: How much can your algorithms tell about a Web page just from the way it looks? Are you also analyzing the actual text?\n\nMT: First we take a URL and determine what type of page it is. We’ve identified roughly 20 types of pages that all the Web can fall into. Article pages, people pages, product pages, photos, videos, and so on. So one of the fields we return will be what is the type of this thing. Then, depending on the type, there are other fields. For the article API [application programming interface], which is one we have out publicly, we can tell you the title, the author, the images, the videos, and the text that go with that article. And we not only identify where the text is, but we can tell you the topics. We do some natural language processing on the text and we can tell you “This is about Apple,” and we can tell it’s about Apple Computer and not the fruit.\n\nJD: Another opportunity we are excited about his how Diffbot can help augment what is natively on the page. Just by dint of following so many pages through our system, we can augment [the existing formatting] and increase the value for whoever is reading. In the case of an article, the fact that we see so many articles means it’s relatively easy for us to generate tags for any given text.\n\nX: How do you turn this all into a business?\n\nMT: We are actually selling something. We are trying to build the Semantic Web, but in a profitable way. We analyze the pages that people pay us to analyze. That’s currently over 100 million URLs per month, which is a good slice of the Web. Other startups have taken the approach of starting by crawling and indexing the Web, and that is very capital-intensive. By doing it this way, another benefit is that people only send us the best parts of \n\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can subscribe to his Google Group or e-mail him at wroush@xconomy.com .\n(Page 4 of 4) \n\n\t\t\tthe Web. Most of the stuff a typical Web crawler goes through never appears in any search results. 
Most of the Web is crap.\n\nX: Are people finding uses for the technology that you may not have thought of?\n\nMT: We had a hackathon last year where a guy came in and built an app for his father, who is blind. It runs Diffbot on a page and makes it into a radio station. For someone who is blind, browsing a news site is usually a really poor experience. The usual screen readers will read the entire page, including the nav bars and the ads and the text. The screen readers have no context about what is important on the page. Using Diffbot to be his father’s eyes, this guy could parse the page and read it in a way that is much more natural.\n\nJD: AOL’s Editions app is one of the more interesting use cases that I’ve seen. It’s an iPad app that features both their own content as well as snippets from across the Web, in a daily issue. I spent five years running engineering for the media solutions group at Cisco, selling a Web platform for media companies, and the biggest problem we faced was dealing with the excess of content management systems that all media companies have. In the case of Editions, AOL has myriad properties that they want to merge into this single app. But rather than consolidate TechCrunch and Engadget and the Huffington Post and a half dozen other sites, they use Diffbot to build a kind of content management system on the fly from the rendered Web pages. They extract the content and deliver it on the fly as if it came from a CMS right to the iPad magazine.\n\nStumbleUpon is another interesting one. They use Diffbot as their moderation queue. Whenever a new website is submitted to their index, they want to make sure it’s legitimate before it’s available for stumbling. They have to rule out people who stumble a page, then swap it out for spam. So they run Diffbot on the source page, pipe that into their moderation queue, and if it looks like a legitimate page they can monitor that and keep checking on a regular basis to see how much it changes. If it has changed much between day 1 and day 10, it might warrant human intervention.\n\nX: Aren’t there are a lot of news reader app these days that are doing the same thing you’re doing when it comes to identifying and isolating the text of a news article? That’s what Instapaper and Pocket and Readability and Zite are all doing.\n\nMT: We power a lot of those apps. Our audience is the developers who work at those companies, who use our API to create their experience.\n\nJD: We make it a lot more affordable to make those kinds of forays. When you look at building your own customized extraction tools, you are talking about multiple developers over weeks or months, to build something that is more brittle than what we offer out of the gate. Our ultimate goal is to be not only better but a lot cheaper than what you could build.\n\nX: It’s not totally clear yet, though, whether publications or apps that aggregate lots of content from elsewhere, like Editions or even Flipboard, are going to be profitable in the long term, and where publishing is going as a business. Don’t you guys feel there’s some risk in tying your fortunes to such a troubled industry?\n\nMT: The more interesting question is how do you monetize the Semantic Web, and where is the money in building the structured information. Articles are only one page type. Another that I mentioned is products. If you could show products on a cell phone, and people could buy the product and we could make that transaction happen, that is one very tangible way of making money. 
I think there is a lot of value in having structured information, because you can connect people more directly to what they want. Once we have the entire Web in machine-readable format, anybody who wants to use any sort of data can use the Diffbot view of it, and I think a lot of those apps can make money. Look at Siri—it’s great but it only works with the 10 or so sources that it’s hard-coded to work with. If you were able to combine Siri with Diffbot, Siri could operate on the Web and take a query and actually do it for you.\n\nX: What page types will you move on to next? Did you start with articles because those are easiest?\n\nMT: I wouldn’t say they were easiest, but they are pretty prevalent on the Web. A variety of factors help us prioritize what we should do next. One signal is what is the prevalence of that type of page on the Web. If doing one page type lets us knock out 30 percent of the Web, maybe we will go for it.\n\nX: Will there always be a need for Diffbot, or with the transition to HTML 5, will Web pages gradually get more structure on their own?\n\nMT: If you look at the ratio of unstructured pages to structured, it’s actually going in the opposite direction. I think human beings are creative, and they design pages for other humans. No matter what, people will find a way to create documents that lie outside of the well-defined tags, whether it’s HTML 5 or Flash or PDF or Xbox. What they all have in common is that they are just vessels that we can easily train and adapt Diffbot to work with.\n\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can subscribe to his Google Group or e-mail him at wroush@xconomy.com .",
+   "title": "Diffbot Is Using Computer Vision to Reinvent the Semantic Web",
+   "nextPages": [
+     "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/2/",
+     "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/3/",
+     "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/4/"
+   ],
+   "images": [
+     {
+       "primary": "true",
+       "url": "http://www.xconomy.com/wordpress/wp-content/images/2012/07/diffbot-reclining-220x146.jpg"
+     },
+     {
+       "caption": "NPR's top news page as interpreted by Diffbot (click for larger version)",
+       "url": "http://www.xconomy.com/wordpress/wp-content/images/2012/07/Screen-Shot-2012-07-25-at-9.13.27-AM-300x332.png"
+     },
+     {
+       "primary": "true",
+       "url": "http://static.xconomy.com/Advertisers/25df47a9f35f4e1dad18a71f750e11a6.jpg"
+     },
+     {
+       "caption": "Diffbot robot",
+       "url": "http://www.xconomy.com/wordpress/wp-content/images/2012/07/12-220x265.png"
+     },
+     {
+       "url": "http://static.xconomy.com/Advertisers/c79cd8619bfd4209bfffe1c1602cee17.jpg"
+     }
+   ],
+ "html": "<img alt=\"Diffbot\" class=\"attachment-200x9999 wp-post-image diffbot_image\" height=\"132\" src=\"http://www.xconomy.com/wordpress/wp-content/images/2012/07/diffbot-reclining-220x146.jpg\" width=\"200\"></img><div>\n\t\t\t\n\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t<p>You know how the <a class=\"ext-link\" href=\"http://www.corp.att.com/attlabs/reputation/timeline/70picture.html\" onclick=\"javascript:_gaq.push(['_trackEvent','outbound-article','http://www.corp.att.com']);\" rel=\"external\" target=\"_blank\" title=\"\">Picturephone</a>, a half-billion-dollar project at AT&amp;T back in the 1960s and 1970s, turned out to be a huge commercial flop, but two-way video communication eventually came back with a vengeance in the form of Skype and FaceTime and Google Hangouts? Well, something similar is going on with the Semantic Web.</p>\n<p>That&rsquo;s the proposal, dating back almost to the invention of the Web in the 1990s, that the various parts of Web pages should be tagged so that machines, as well as people, can make inferences based on the information they contain. The idea has never gotten very far, mainly because the burden of tagging all that content would fall to humans, which makes it expensive and tedious. But now it looks like the original goal of making digital content more comprehensible to computers might be achievable at far lower cost, thanks to better software.</p>\n<p><a class=\"ext-link\" href=\"http://www.diffbot.com\" onclick=\"javascript:_gaq.push(['_trackEvent','outbound-article','http://www.diffbot.com']);\" rel=\"external\" target=\"_blank\" title=\"\">Diffbot</a> is building that software. This unusual startup&mdash;the first ever to emerge from the Stanford-based accelerator , back in 2009&mdash;is using computer vision technology similar to that used for robotics applications such as self-driving cars to classify the parts of Web pages so that they can be reassembled in other forms. AOL is one of the startup&rsquo;s first big customers and its landlord. It&rsquo;s using Diffbot&rsquo;s technology to assemble <a href=\"http://www.xconomy.com/national/2012/01/20/news-readers/2/\">Editions by AOL</a>, the personalized, iPad-based magazine comprised of content culled from AOL properties like the Huffington Post, TechCrunch, and Engadget.</p>\n\n<p>I went down to AOL&rsquo;s Palo Alto campus last month to meet the company&rsquo;s founder and CEO Mike Tung and its vice president of products John Davi. They didn&rsquo;t deliberately set out to solve the Semantic Web problem, any more than the founders of Skype set out to build an affordable Picturephone. But their venture, which <a href=\"http://www.xconomy.com/san-francisco/2012/06/01/diffbot-garners-2000000-new-round/\">has attracted about $2 million</a> in backing from Andy Bechtolsheim and a raft of other angel investing stars, is already on its way to creating one of the world&rsquo;s largest structured indexes of unstructured Web content.</p>\n<p>Without relying on HTML tags (which can actually be used to trick traditional Web crawling software), Diffbot can look at a news page and tell what&rsquo;s a headline, what&rsquo;s a byline, where the article text begins and ends, what&rsquo;s an advertisement, and so forth. What practical use can companies make of that, and where&rsquo;s the profit in it for Diffbot? 
Well, aside from AOL, the startup&rsquo;s software is already being used in some interesting places: reading app maker uses it to extract article text from websites, and content discovery service employs it to screen out spam.</p>\n<p>In fact, companies pay Diffbot to analyze more than 100 million unique URLs per month. And that&rsquo;s just the beginning. Building outward from its early focus on news articles, the startup is creating new algorithms that could make sense of many kinds of sites, such as e-commerce catalogs. The individual elements of those sites could then be served up in almost any context. Imagine a Siri for shopping, to take just one example. &ldquo;We&rsquo;re building a series of wedges that will add up to a complete view of the Web,&rdquo; says Davi. &ldquo;We are excited about having them all under our belt, so there can be a fully indexed, reverse-engineered Semantic Web.&rdquo;</p>\n<p>What follows is a highly compressed version of my conversation with Tung and Davi.</p>\n<p><strong>Xconomy:</strong> Where did you guys meet, and how did you end up working on Diffbot?</p>\n<p><strong>Mike Tung:</strong> I worked at Microsoft on Windows Vista right out of high school, then went to college at Cal and studied electrical engineering for two years, then went to Stanford to start a PhD in computer science, specializing in AI. When I first moved to Silicon Valley, I also worked at a bunch of startups. I was engineer number four at TheFind, which was a product search company that built the world&rsquo;s largest product index. I worked on search at Yahoo and eBay, and also did a bunch of contract work. I took the patent bar and worked as a patent lawyer for a couple of years, writing 3G and 4G patents for Panasonic and Matsushita. I first met John when we were working at a startup called ClickTV, which was a video-player-search-engine thing. It was pretty advanced for its time.</p>\n<p>Diffbot began when I was in grad school at Stanford [in 2005]. There was this one quarter where I was taking a lot of classes, so I made this tool for myself to keep track of all of them. I would put in the URL for the class website, and whenever a professor would upload new slides or content, Diffbot would find that and download it to my phone. I always felt like I knew what was going on in my classes without having to attend every single one.</p>\n<p>It was useful, and my friends started asking me whether they could use it. So I turned it into a Web service and </p>\n\t\t\t\t\t\t\t\t\t\t<p class=\"authorBio clearFix\">\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can <a href=\"https://groups.google.com/forum/?hl=en_US&fromgroups#!forum/waderoush\">subscribe to his Google Group</a> or e-mail him at <a href=\"mailto:wroush@xconomy.com\">wroush@xconomy.com</a>. 
\t\t\t\t\t\t\t\t</p>\n\t\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t</div><img alt=\"\" class=\"size-large wp-image-198170 diffbot_image\" height=\"332\" src=\"http://www.xconomy.com/wordpress/wp-content/images/2012/07/Screen-Shot-2012-07-25-at-9.13.27-AM-300x332.png\" title=\"NPR's top news page as interpreted by Diffbot\" width=\"300\"></img><br class=\"diffbot_nextPage\"> <img alt=\"Diffbot\" class=\"attachment-200x9999 wp-post-image diffbot_image\" height=\"132\" src=\"http://www.xconomy.com/wordpress/wp-content/images/2012/07/diffbot-reclining-220x146.jpg\" width=\"200\"></img><div>\n\t\t\t\n\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t<p> (Page 2 of 4) </p>\n\t\t\t<p>started running it out of a dorm at Stanford. And people started adding a bunch of different kinds of URLs to Diffbot outside of classes, like they might add Craigslist if they were searching for a job or a product, or Facebook if they wanted to see if their ex&rsquo;s profile had changed.</p>\n<p><strong>X:</strong> So I assume the name &ldquo;Diffbot&rdquo; related to comparing the old and new versions of a website and detecting the differences?</p>\n<p><strong>MT:</strong> Yes, but just doing deltas on Web pages doesn&rsquo;t work too well. It turns out that on the modern Web, every page refresh changes the ads and the counters. You have to be a little more intelligent.</p>\n<p>That&rsquo;s where understanding the page comes into play. I was studying machine learning at Stanford, and in particular one project I had worked on was the vision system for the self-driving car [Stanford&rsquo;s entry in the 2007 DARPA Urban Challenge]. This was the stereo camera system that would compute the depth of a scene and say, &lsquo;This is a cactus, this is drivable dirt, this is not drivable dirt, this is a cliff, this is a very narrow passageway.&rsquo; I realized that one way of making Diffbot generalizable was to apply computer vision to Web pages. Not to say, &lsquo;This is a cactus and this is a pedestrian,&rsquo; but to say, &lsquo;This is an advertisement and this is a footer and this is a product.&rsquo;</p>\n<p>A human being can look at Web page and very easily tell what type of page it is without even looking at the text, and that is what we are teaching Diffbot to do. The goal is to build a machine-readable version of the entire Web.</p>\n<p><strong>X:</strong> Isn&rsquo;t that what Tim Berners-Lee has been talking about for years&mdash;building a Semantic Web that&rsquo;s machine-readable?</p>\n<p><strong>MT:</strong> It seems that every three years or so a new Semantic Web technology gets hyped up again. There was RSS, RDF, OWL, and now it&rsquo;s Open Graph and the Knowledge Graph. The central problem&mdash;why none of these have really gone mainstream&mdash;is that you are requiring humans to tag the content twice, once for the machine&rsquo;s benefit and once for the actual humans. Because you are placing so much onus on the content creators, you are never going to have all of the content in any given system. So it will be fragmented into different Semantic Web file formats, and because of that you will never have an app that allows you to search and evaluate all that information.</p>\n<p>But what if you analyze the page itself? That is where we have an opportunity, by applying computer vision to eliminate the problem of manual tagging. 
And we have reached a certain point in the technology continuum where it is actually possible&mdash;where the CPUs are fast enough and the machine learning technology is good enough that we have a good shot of doing it with high accuracy.</p>\n<p><strong>X:</strong> Why are you so convinced that a human-tagged Semantic Web would never work?</p>\n<p><strong>MT:</strong> The number one point is that people are lazy. The second is that people lie. Google used to read the meta tags and keywords at the top of a Web page, and so people would start stuffing those areas with everything. It didn&rsquo;t correspond to what actual humans saw. The same thing holds for Semantic Web formats. Whenever you have things indexed separately, you start to see spam. By using a robot to look at the page, you are keeping it above that.</p>\n<p><strong>X:</strong> Talk about the computer vision aspect of Diffbot. How literal is the comparison to the cameras and radar on robot cars?</p>\n<p><strong>MT:</strong> We use the very same techniques used in computer vision, for example object detection and edge detection. If you are a customer, you give us a URL to analyze. We render the page using a virtual Webkit browser in the cloud. It will render the page, run the Javascript, and lay everything out with the CSS rules and everything. Then we have these hooks into Webkit that </p>\n\t\t\t\t\t\t\t\t\t\t<p class=\"authorBio clearFix\">\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can <a href=\"https://groups.google.com/forum/?hl=en_US&fromgroups#!forum/waderoush\">subscribe to his Google Group</a> or e-mail him at <a href=\"mailto:wroush@xconomy.com\">wroush@xconomy.com</a>. \t\t\t\t\t\t\t\t</p>\n\t\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t</div><br class=\"diffbot_nextPage\"> <img class=\"diffbot_image\" src=\"http://static.xconomy.com/Advertisers/25df47a9f35f4e1dad18a71f750e11a6.jpg\"></img><div>\n\t\t\t\n\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t<p> (Page 3 of 4) </p>\n\t\t\t<p>allow us to get all of the visual and geometric information out of the page. For every rectangle, we pull out things like the x and y coordinates, the heights and widths, the positioning relative to everything else, the font sizes, the colors, and other visual cues. In much the same way, when I was working on the self-driving car, we would look at a patch and do edge detection to determine the shape of a thing or find the horizon.</p>\n<p><strong>X:</strong> Once you identify those shapes and other elements, how do you say, &ldquo;This is a headline, this is an article,&rdquo; et cetera?</p>\n<p><strong>MT:</strong> We have an ontology. Other people have done good work defining what those ontologies should be&mdash;there are many of them at schema.org, which reflects what the search engines have proposed as ontologies. We also have human beings who draw rectangles on the pages and teach Diffbot &ldquo;this is what an author field looks like, this is what a product looks like, this is what a price looks like,&rdquo; and from those rectangles we can generalize. 
It&rsquo;s a machine learning system, so it lives and breathes on the training data that is fed into it.</p>\n<p><strong>X:</strong> Do you actually do all the training work yourselves, or do you crowdsource it out somehow?</p>\n<p><strong><a href=\"http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/attachment/12-3/\" rel=\"attachment wp-att-198179\"><img alt=\"\" class=\"alignleft size-medium wp-image-198179 diffbot_image\" height=\"265\" src=\"http://www.xconomy.com/wordpress/wp-content/images/2012/07/12-220x265.png\" title=\"Diffbot robot\" width=\"220\"></img></a>John Davi:</strong> We have done a combination of things. We always have a cold-start problem firing up new type of pages&mdash;products versus articles, or a new algorithm for press releases, for example. We leverage both grunt work internally&mdash;just grinding out our own examples, which has the side benefit of keeping us informed about the real world&mdash;but yeah, also crowdsourcing, which gives us a much broader variety of input and opinion. We have used everything, including off-the-shelf crowdsourcing tools like Mechanical Turk and Crowdflower, and we have build up our own group of quasi-contract crowdsourcers.</p>\n<p>Our basic effort is to cold-start it ourselves, then get an alpha-level product into the hands of our customer, which will then drastically increase the amount of training data we have. Sometimes we look at the stream of content and eyeball it and manually tweak and correct. In a lot of cases our customer gets involved. If they have an interest in helping to train the algorithm&mdash;it not only makes it better for them, but if they are first out of the gate they can tailor the algorithm to their very particular needs.</p>\n<p><strong>X:</strong> How much can your algorithms tell about a Web page just from the way it looks? Are you also analyzing the actual text?</p>\n<p><strong>MT:</strong> First we take a URL and determine what type of page it is. We&rsquo;ve identified roughly 20 types of pages that all the Web can fall into. Article pages, people pages, product pages, photos, videos, and so on. So one of the fields we return will be what is the type of this thing. Then, depending on the type, there are other fields. For the article API [application programming interface], which is one we have out publicly, we can tell you the title, the author, the images, the videos, and the text that go with that article. And we not only identify where the text is, but we can tell you the topics. We do some natural language processing on the text and we can tell you &ldquo;This is about Apple,&rdquo; and we can tell it&rsquo;s about Apple Computer and not the fruit.</p>\n<p><strong>JD:</strong> Another opportunity we are excited about his how Diffbot can help augment what is natively on the page. Just by dint of following so many pages through our system, we can augment [the existing formatting] and increase the value for whoever is reading. In the case of an article, the fact that we see so many articles means it&rsquo;s relatively easy for us to generate tags for any given text.</p>\n<p><strong>X:</strong> How do you turn this all into a business?</p>\n<p><strong>MT:</strong> We are actually selling something. We are trying to build the Semantic Web, but in a profitable way. We analyze the pages that people pay us to analyze. That&rsquo;s currently over 100 million URLs per month, which is a good slice of the Web. 
Other startups have taken the approach of starting by crawling and indexing the Web, and that is very capital-intensive. By doing it this way, another benefit is that people only send us the best parts of </p>\n\t\t\t\t\t\t\t\t\t\t<p class=\"authorBio clearFix\">\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can <a href=\"https://groups.google.com/forum/?hl=en_US&fromgroups#!forum/waderoush\">subscribe to his Google Group</a> or e-mail him at <a href=\"mailto:wroush@xconomy.com\">wroush@xconomy.com</a>. \t\t\t\t\t\t\t\t</p>\n\t\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t</div><img alt=\"Diffbot\" class=\"attachment-200x9999 wp-post-image diffbot_image\" height=\"132\" src=\"http://www.xconomy.com/wordpress/wp-content/images/2012/07/diffbot-reclining-220x146.jpg\" width=\"200\"></img><br class=\"diffbot_nextPage\"> <img class=\"diffbot_image\" src=\"http://static.xconomy.com/Advertisers/25df47a9f35f4e1dad18a71f750e11a6.jpg\"></img><div>\n\t\t\t\n\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t<p> (Page 4 of 4) </p>\n\t\t\t<p>the Web. Most of the stuff a typical Web crawler goes through never appears in any search results. Most of the Web is crap.</p>\n<p><strong>X:</strong> Are people finding uses for the technology that you may not have thought of?</p>\n<p><strong>MT:</strong> We had a hackathon last year where a guy came in and built an app for his father, who is blind. It runs Diffbot on a page and makes it into a radio station. For someone who is blind, browsing a news site is usually a really poor experience. The usual screen readers will read the entire page, including the nav bars and the ads and the text. The screen readers have no context about what is important on the page. Using Diffbot to be his father&rsquo;s eyes, this guy could parse the page and read it in a way that is much more natural.</p>\n<p><strong>JD:</strong> AOL&rsquo;s Editions app is one of the more interesting use cases that I&rsquo;ve seen. It&rsquo;s an iPad app that features both their own content as well as snippets from across the Web, in a daily issue. I spent five years running engineering for the media solutions group at Cisco, selling a Web platform for media companies, and the biggest problem we faced was dealing with the excess of content management systems that all media companies have. In the case of Editions, AOL has myriad properties that they want to merge into this single app. But rather than consolidate TechCrunch and Engadget and the Huffington Post and a half dozen other sites, they use Diffbot to build a kind of content management system on the fly from the rendered Web pages. They extract the content and deliver it on the fly as if it came from a CMS right to the iPad magazine.</p>\n<p>StumbleUpon is another interesting one. They use Diffbot as their moderation queue. Whenever a new website is submitted to their index, they want to make sure it&rsquo;s legitimate before it&rsquo;s available for stumbling. They have to rule out people who stumble a page, then swap it out for spam. So they run Diffbot on the source page, pipe that into their moderation queue, and if it looks like a legitimate page they can monitor that and keep checking on a regular basis to see how much it changes. 
If it has changed much between day 1 and day 10, it might warrant human intervention.</p>\n<p><strong>X:</strong> Aren&rsquo;t there are a lot of news reader app these days that are doing the same thing you&rsquo;re doing when it comes to identifying and isolating the text of a news article? That&rsquo;s what Instapaper and Pocket and Readability and Zite are all doing.</p>\n<p><strong>MT:</strong> We power a lot of those apps. Our audience is the developers who work at those companies, who use our API to create their experience.</p>\n<p><strong>JD:</strong> We make it a lot more affordable to make those kinds of forays. When you look at building your own customized extraction tools, you are talking about multiple developers over weeks or months, to build something that is more brittle than what we offer out of the gate. Our ultimate goal is to be not only better but a lot cheaper than what you could build.</p>\n<p><strong>X:</strong> It&rsquo;s not totally clear yet, though, whether publications or apps that aggregate lots of content from elsewhere, like Editions or even Flipboard, are going to be profitable in the long term, and where publishing is going as a business. Don&rsquo;t you guys feel there&rsquo;s some risk in tying your fortunes to such a troubled industry?</p>\n<p><strong>MT:</strong> The more interesting question is how do you monetize the Semantic Web, and where is the money in building the structured information. Articles are only one page type. Another that I mentioned is products. If you could show products on a cell phone, and people could buy the product and we could make that transaction happen, that is one very tangible way of making money. I think there is a lot of value in having structured information, because you can connect people more directly to what they want. Once we have the entire Web in machine-readable format, anybody who wants to use any sort of data can use the Diffbot view of it, and I think a lot of those apps can make money. Look at Siri&mdash;it&rsquo;s great but it only works with the 10 or so sources that it&rsquo;s hard-coded to work with. If you were able to combine Siri with Diffbot, Siri could operate on the Web and take a query and actually do it for you.</p>\n<p><strong>X:</strong> What page types will you move on to next? Did you start with articles because those are easiest?</p>\n<p><strong>MT:</strong> I wouldn&rsquo;t say they were easiest, but they are pretty prevalent on the Web. A variety of factors help us prioritize what we should do next. One signal is what is the prevalence of that type of page on the Web. If doing one page type lets us knock out 30 percent of the Web, maybe we will go for it.</p>\n<p><strong>X:</strong> Will there always be a need for Diffbot, or with the transition to HTML 5, will Web pages gradually get more structure on their own?</p>\n<p><strong>MT:</strong> If you look at the ratio of unstructured pages to structured, it&rsquo;s actually going in the opposite direction. I think human beings are creative, and they design pages for other humans. No matter what, people will find a way to create documents that lie outside of the well-defined tags, whether it&rsquo;s HTML 5 or Flash or PDF or Xbox. 
What they all have in common is that they are just vessels that we can easily train and adapt Diffbot to work with.<span class=\"read_more\"> <a href=\"http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/3/\"></a></span></p>\n\t\t\t\t\t\t\t\t\t\t<p class=\"authorBio clearFix\">\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can <a href=\"https://groups.google.com/forum/?hl=en_US&fromgroups#!forum/waderoush\">subscribe to his Google Group</a> or e-mail him at <a href=\"mailto:wroush@xconomy.com\">wroush@xconomy.com</a>. \t\t\t\t\t\t\t\t</p>\n\t\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t</div><img alt=\"Diffbot\" class=\"attachment-200x9999 wp-post-image diffbot_image\" height=\"132\" src=\"http://www.xconomy.com/wordpress/wp-content/images/2012/07/diffbot-reclining-220x146.jpg\" width=\"200\"></img><img alt=\"Stratos\" border=\"0\" class=\"diffbot_image\" height=\"250\" src=\"http://static.xconomy.com/Advertisers/c79cd8619bfd4209bfffe1c1602cee17.jpg\" title=\"Stratos\" width=\"300\"></img>",
+   "numPages": 4,
+   "date": "Wed, 25 Jul 2012 07:00:00 GMT",
+   "type": "article",
+   "human_language": "en",
+   "url": "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/"
+ }
@@ -0,0 +1,17 @@
+ require 'spec_helper'
+
+ module DiffbotSimple
+   describe Symbolize do
+     let(:test_data) { MultiJson.load File.read("spec/serialize_test_data.json") }
+     let(:subject) { Symbolize.symbolize test_data }
+     context "when symbolizing the test data" do
+       it "should not raise errors" do
+         expect{ subject }.to_not raise_error
+       end
+       it "should have :nextPages as an array" do
+         expect(subject[:nextPages]).to be_a Array
+       end
+     end
+   end
+
+ end
@@ -16,12 +16,12 @@ module DiffbotSimple::V2
        it "should return the response body as a symbolized hash" do
          expect(subject).to eql JSON.parse(single_response[:body], symbolize_names: true)
        end
-       it "should respond and return the apis url in to_crawl_api_url" do
-         expect(analyze.to_crawl_api_url).to eql "#{api_url}?mode=auto"
+       it "should respond and return the apis url in to_api_url" do
+         expect(analyze.to_api_url).to eql "#{api_url}?mode=auto"
        end
      end
      context "when asking for an analyze with no options" do
-       let(:subject) { analyze.single_analysis url: url}
+       let(:subject) { analyze.request url: url}
        let(:stubbed_request) { stub_request(:get, api_url).with(query: {token: token, url: url}).to_return(single_response) }
        it_should_behave_like "an analyze request"
      end
@@ -29,7 +29,7 @@ module DiffbotSimple::V2
        let(:fields) {"a,b,c"}
        let(:mode) { "article" }
        let(:stats) { true }
-       let(:subject) { analyze.single_analyze url: url, stats: stats, mode: mode, fields: fields }
+       let(:subject) { analyze.request url: url, stats: stats, mode: mode, fields: fields }
        let(:stubbed_request) { stub_request(:get, api_url).with(query: {token: token, url: url, stats: stats.to_s, mode: mode, fields: fields}).to_return(single_response) }
        it_should_behave_like "an analyze request"
      end