diffbot_simple 0.0.4 → 1.0.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 5858d1e4c275759f39cdb4ec782fa2e7e69e18bd
- data.tar.gz: 62eab98cb5df667fa5d47f7bcee6968774a3f14c
+ metadata.gz: ddb7b5f429829cc472112429600c1456fd5ba83e
+ data.tar.gz: 2ce6e971d8f396f5e0460dba844a3885fc4a55bd
  SHA512:
- metadata.gz: 10a2d414f7332febb4b96b4fb0e394463ee1f42b36e93cb328282e6015020d7f302707b33da354e410783322248a70f9d2fb22f4e5ade402bcce2cd5d1cfa64a
- data.tar.gz: 8804c3d2c4371cdf4757a7e36d09b015e4539b8a0e22cb1fa255f60252574fd4d7d76fbae34144e0393413d37d4ff30ca4012b22039c28f2dda47bbebeb339c2
+ metadata.gz: 2d90e57dfe1ee1b4fdf8380f4c3d29e293d0034b57dbf6984e76ed1bc64bed7c5965d3f8974be492544676836ca5269886c5870032a60cc30680d1fd04aa0d0b
+ data.tar.gz: 78206846dde0211f4f6d9e701caf3a3b8a5ff152b3981716622ac0c2d36535e746d7f8ec60868b674399bf079fb79d60d8d122e2bdbdd6dfa0e6a889b1642600
data/README.md CHANGED
@@ -8,8 +8,10 @@ DiffbotSimple
 
  A simple, nothing-fancy, helper for the [Diffbot API](http://www.diffbot.com/).
 
- Will not objectify any responses, just pass on the json data as hash with symbolized keys.
- One exception to that rule, when using CrawlBot and requesting a single_crawl, it will return the single item in the :jobs-array, and when requesting all, it will return the array in :jobs.
+ Will not objectify any responses; Bulk and Crawlbot, however, are lightly wrapped.
+ For these two APIs it ignores the success message and deals only with the contents of the `:jobs` array.
+
+ For the other APIs it just passes the JSON data on to the request as a hash with symbolized keys.
  Send options to the api as named args, see usage below with the article and fields arguments.
 
  ## Installation
@@ -33,7 +35,7 @@ client = DiffbotSimple::V2::Client.new token: token
  article = client.article
  url = "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/"
  # Pass on diffbot parameters as options to the call
- diffbot_response_as_symbolized_hash = article.single_article url: url, fields: "icon,title"
+ diffbot_response_as_symbolized_hash = article.request url: url, fields: "icon,title"
  # =>
  {
    icon: "http://www.xconomy.com/wordpress/wp-content/themes/xconomy/images/favicon.ico",
@@ -56,39 +58,60 @@ client = DiffbotSimple::V2::Client.new token: token
  url = "http://some_url_to_check"
 
  # Custom API
+ # will raise an error if the API does not exist; create it at http://www.diffbot.com/dev/customize/
  custom = client.custom name: "my_custom_api_name"
- response = custom.single_custom url: url
+ response = custom.request url: url
 
  # Analyze API (beta)
- analysis = client.analyze
- response = analyze.single_analysis url: url
+ analyze = client.analyze
+ response = analyze.request url: url
 
  # Article API
  article = client.article
- response = article.single_article url: url
+ response = article.request url: url
 
  # Image API
  image = client.image
- response = image.single_image url: url
+ response = image.request url: url
 
  # Product API
  product = client.product
- response = product.single_product url: url
+ response = product.request url: url
 
  # Crawlbot API
- crawlbot = client.crawlbot
- all_my_crawls = crawlbot.all
- current_settings = crawlbot.single_crawl name: "my_crawl"
+ all_my_crawls = client.crawl
+ crawl = client.crawl name: "mycrawl"
+ current_parameters = crawl.parameters
  # shorthand for using apiUrl, use the api object from client,
  # it will create a correct value for you
  # (custom, image, article, product or analyze for automatic)
- # A call to single_crawl will create if not exists or update settings
- settings = crawlbot.single_crawl name: "my_new_crawl", onlyProcessIfNew: 0, seeds: "http://www.upptec.se", apiUrl: custom
- crawlbot.pause name: "my_new_crawl"
- crawlbot.unpause name: "my_new_crawl"
- crawlbot.restart name: "my_new_crawl"
- result = crawlbot.result "my_new_crawl" # shorthand for downloading the json that are specifed in :downloadJson
- crawlbot.delete name: "my_new_crawl"
+ crawl.apiUrl = product # the object from above
+ # A call to client.crawl name: "mycrawl" will create the crawl if it does not exist
+ # (works with a symbol too, client.crawl name: :mycrawl)
+ # To update parameters:
+ crawl.update onlyProcessIfNew: 0, seeds: "http://www.upptec.se", apiUrl: custom
+ # or one property at a time; this works only on already loaded parameters
+ crawl.onlyProcessIfNew = 0 # sends the update immediately to diffbot
+ crawl.seeds = "http://www.upptec.se" # sends the update immediately to diffbot
+ # direct read access by name:
+ current_seeds = crawl.seeds
+ # actions:
+ crawl.pause
+ crawl.unpause
+ crawl.restart
+ # results is shorthand for downloading the json specified in :downloadJson
+ results = crawl.results
+ crawl.delete!
+
+ # Bulk API
+ # is based on crawlbot and works exactly the same
+ all_my_bulk_jobs = client.bulk
+ bulk = client.bulk name: "mycrawl"
+ current_parameters = bulk.parameters
+ # and so on, as with crawlbot above;
+ # however, you can add urls to process as an array using the #process method:
+ bulk.process ["http://foo.bar", "http://bar.foo"]
+
  ```
 
  ### On error
@@ -96,7 +119,6 @@ If Diffbot returns an error, it will raise and fill `DiffbotSimple::V2::DiffbotE
 
  ## TODO
  * Frontpage API
- * Bulk API
  * Async http fetching
  * Batch API
 
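The hunk above references the README's "On error" section: a Diffbot error response raises a `DiffbotSimple::V2::DiffbotError`. A minimal sketch of handling it, assuming only what the README states and reusing the `article` object from the usage example (url and message format are illustrative):

```ruby
begin
  article.request url: "http://example.com/no-such-page"
rescue DiffbotSimple::V2::DiffbotError => e
  # the error object is filled with what Diffbot returned
  warn "Diffbot reported an error: #{e.message}"
end
```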
data/lib/diffbot_simple/symbolize.rb CHANGED
@@ -1,29 +1,32 @@
  module DiffbotSimple
    module Symbolize
-     private
-     def y_combinator(&f)
-       lambda do |g|
-         f.call {|*args| g[g][*args]}
-       end.tap {|g| break g[g]}
-     end
-
      def symbolize hash
        return hash unless hash.kind_of? Hash or hash.kind_of? Array
        sym_hash = y_combinator do |&f|
          lambda do |h|
            if h.kind_of? Array
              h.map {|r| f.call(r)}
-           else
+           elsif h.kind_of? Hash
              h.reduce({}) do |memo,(k,v)|
                v = f.call(v) if v.kind_of? Hash
                v = v.map {|u| f.call(u)} if v.kind_of? Array
                memo[k.to_sym] = v
                memo
              end
+           else
+             h
            end
          end
        end
        sym_hash.call hash
      end
+
+     private
+     def y_combinator(&f)
+       lambda do |g|
+         f.call {|*args| g[g][*args]}
+       end.tap {|g| break g[g]}
+     end
+     module_function :symbolize, :y_combinator
    end
  end
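Since `module_function` now exposes `symbolize` on the module itself, its effect can be sketched directly (sample data invented for illustration): string keys become symbols, recursing through hashes and arrays, while non-hash values pass through untouched thanks to the new `else` branch.

```ruby
require 'diffbot_simple'

# a nested, string-keyed structure such as parsed Diffbot JSON
DiffbotSimple::Symbolize.symbolize(
  "title"  => "Example",
  "images" => [{ "url" => "http://example.com/a.png" }]
)
# => { title: "Example", images: [{ url: "http://example.com/a.png" }] }
```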
data/lib/diffbot_simple/v2/analyze.rb CHANGED
@@ -5,14 +5,9 @@ module DiffbotSimple::V2
      def post_initialize
        @api = :analyze
      end
-     def to_crawl_api_url
+     def to_api_url
        default = super
        "#{default}?mode=auto"
      end
-     def single_analysis url: nil, **options
-       raise ArgumentError.new "Must pass an url to fetch" unless url
-       execute_call options.merge(url: url)
-     end
-     alias :single_analyze :single_analysis
    end
  end
data/lib/diffbot_simple/v2/api_helper.rb CHANGED
@@ -9,9 +9,14 @@ module DiffbotSimple::V2
      def post_initialize
        raise "Must overload to set api path"
      end
-     def to_crawl_api_url
+     def to_api_url
        "#{api_client.site}#{api}"
      end
+     # overload if necessary
+     def request url: nil, **options
+       raise ArgumentError.new "Must pass an url for the request to work" unless url
+       execute_call options.merge(url: url)
+     end
      private
      attr_reader :token, :api_client, :api
      def execute_call custom_headers: nil, method: :get, payload: nil, **options
@@ -33,7 +38,8 @@ module DiffbotSimple::V2
        merged
      end
      def expand_api_url api_url
-       api_url.to_crawl_api_url if api_url.respond_to?(:to_crawl_api_url)
+       return api_url.to_api_url if api_url.respond_to?(:to_api_url)
+       return api_url
      end
      def raise_if_error_response result_from_diffbot
        return unless result_from_diffbot[:error]
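The change to `expand_api_url` fixes a subtle bug: the old one-line version returned `nil` for any value that did not respond to `to_crawl_api_url`, silently dropping a plain `apiUrl` string. A sketch of the two paths through the new version (return values are illustrative):

```ruby
# inside ApiHelper, after the change:
expand_api_url(analyze)                # api object: delegates to analyze.to_api_url
expand_api_url("http://my.custom/api") # plain string: returned unchanged (was nil before)
```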
data/lib/diffbot_simple/v2/article.rb CHANGED
@@ -5,7 +5,7 @@ module DiffbotSimple::V2
      def post_initialize
        @api = :article
      end
-     def single_article url: nil, custom_headers: nil, body: nil, **options
+     def request url: nil, custom_headers: nil, body: nil, **options
        raise ArgumentError.new "Must pass an url for the article api to fetch" unless url
        if body
          custom_headers ||= {}
data/lib/diffbot_simple/v2/bulk.rb ADDED
@@ -0,0 +1,11 @@
+ module DiffbotSimple::V2
+   class Bulk < Crawl
+     def initialize bulk_api: nil, name: nil, init: {}, **parameters
+       super parameters.merge(name: name, crawlbot_api: bulk_api, init: init)
+     end
+     def process urls_to_process
+       urls_to_process = [urls_to_process] unless urls_to_process.respond_to? :join
+       send_to_api urls: urls_to_process.join(" ")
+     end
+   end
+ end
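`process` accepts either a single url or an array: anything that does not respond to `join` (such as a `String`) is wrapped first, and the urls are then sent as one space-separated `urls` parameter. A short sketch (urls are illustrative):

```ruby
bulk = client.bulk name: "mybulk"
bulk.process "http://foo.bar"                      # single url is wrapped into an array
bulk.process ["http://foo.bar", "http://bar.foo"]  # sent as "http://foo.bar http://bar.foo"
```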
data/lib/diffbot_simple/v2/bulk_api.rb ADDED
@@ -0,0 +1,8 @@
+ module DiffbotSimple::V2
+   # Complies to http://www.diffbot.com/dev/docs/bulk/
+   class BulkApi < CrawlbotApi
+     def post_initialize
+       @api = :bulk
+     end
+   end
+ end
data/lib/diffbot_simple/v2/client.rb CHANGED
@@ -1,13 +1,12 @@
  require_relative 'api_client'
  module DiffbotSimple::V2
    class Client
-     def initialize token: nil
+     def initialize token: nil, bulk_api: nil, api_client: nil, crawlbot_api: nil
        raise ArgumentError.new("Must supply developer token") if token.to_s.empty?
        @token = token
-       @api_client = ApiClient.new
-     end
-     def crawlbot
-       Crawlbot.new api_client: api_client, token: token
+       @api_client = api_client ||= ApiClient.new
+       @bulk_api = bulk_api ||= BulkApi.new(api_client: api_client, token: token)
+       @crawlbot_api = crawlbot_api ||= CrawlbotApi.new(api_client: api_client, token: token)
      end
      def article
        Article.new api_client: api_client, token: token
@@ -24,7 +23,15 @@ module DiffbotSimple::V2
      def analyze
        Analyze.new api_client: api_client, token: token
      end
+     def bulk name: nil
+       return bulk_api.all.map { |e| Bulk.new name: e.delete(:name), init: e, bulk_api: bulk_api } unless name
+       return Bulk.new name: name, bulk_api: bulk_api
+     end
+     def crawl name: nil
+       return crawlbot_api.all.map { |e| Crawl.new name: e.delete(:name), init: e, crawlbot_api: crawlbot_api } unless name
+       return Crawl.new name: name, crawlbot_api: crawlbot_api
+     end
      private
-     attr_reader :token, :api_client
+     attr_reader :token, :api_client, :bulk_api, :crawlbot_api
    end
  end
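The widened `initialize` signature makes the client's collaborators injectable, which is mainly useful in tests. A hedged sketch inside an RSpec example (`fake_api_client` and the stubbed `site` value are hypothetical; `site` is the only method ApiHelper is shown calling on the api client):

```ruby
fake_api_client = double("api client", site: "http://api.diffbot.com/v2/")
client = DiffbotSimple::V2::Client.new token: "token", api_client: fake_api_client
```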
data/lib/diffbot_simple/v2/crawl.rb ADDED
@@ -0,0 +1,53 @@
+ module DiffbotSimple::V2
+   class Crawl
+     attr_reader :parameters, :name
+     def initialize crawlbot_api: nil, name: nil, init: {}, **parameters
+       @crawlbot_api = crawlbot_api
+       @name = name
+       if init.empty?
+         send_to_api parameters
+       else
+         @parameters = init
+       end
+     end
+     def pause
+       send_to_api pause: 1
+     end
+     def unpause
+       send_to_api pause: 0
+     end
+     def delete!
+       send_to_api delete: 1
+       @parameters = {}
+     end
+     def restart
+       send_to_api restart: 1
+     end
+     def update **parameters
+       send_to_api parameters
+     end
+     def results
+       crawlbot_api.results url: parameters[:downloadJson]
+     end
+     def refresh
+       send_to_api
+     end
+     def apiUrl= api_url
+       send_to_api apiUrl: api_url
+     end
+     def method_missing property, *args
+       key = property.to_s.gsub(/\=$/,"").to_sym
+       super unless parameters.has_key? key
+       return send_to_api({ key => args.join(",") }) if property.to_s.match(/\=$/) or !args.empty?
+       return parameters[key]
+     end
+     private
+     attr_reader :crawlbot_api
+     def send_to_api **options
+       params = options.merge({name: name})
+       @parameters = crawlbot_api.single params
+       @parameters.delete :name
+       self
+     end
+   end
+ end
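The `method_missing` at the bottom turns every loaded parameter into a reader and a writer, but only for keys already present in `parameters`; unknown names still fall through to `super` and raise `NoMethodError`. A sketch of the three cases (parameter names are illustrative):

```ruby
crawl.seeds                       # read: returns parameters[:seeds]
crawl.seeds = "http://a.example"  # write: sends the update to diffbot immediately
crawl.no_such_parameter           # NoMethodError, key not in loaded parameters
```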
data/lib/diffbot_simple/v2/crawlbot_api.rb ADDED
@@ -0,0 +1,22 @@
+ module DiffbotSimple::V2
+   # Complies to http://www.diffbot.com/dev/docs/crawlbot/
+   class CrawlbotApi
+     include ApiHelper
+     def post_initialize
+       @api = :crawl
+     end
+     def all
+       execute_call()[:jobs].select { |e| e[:type] == @api.to_s }
+     end
+     def single name: nil, **options
+       response = execute_call options.merge(name: name)
+       return response[:jobs].select { |e| e[:type] == @api.to_s }.first if response.has_key?(:jobs)
+       response
+     end
+     def results url: nil
+       return [] unless url
+       response = api_client.get url
+       symbolize response
+     end
+   end
+ end
data/lib/diffbot_simple/v2/custom.rb CHANGED
@@ -5,15 +5,11 @@ module DiffbotSimple::V2
      attr_reader :name
      def initialize name: nil, **options
        raise ArgumentError.new "Must pass a name for the custom api" unless name
-       @name = name
+       @name = name.to_s
        super options
      end
      def post_initialize
        @api = "api/#{CGI::escape(name)}"
      end
-     def single_custom url: nil, **options
-       raise ArgumentError.new "Must pass an url for the custom api to fetch" unless url
-       execute_call options.merge(url: url)
-     end
    end
  end
data/lib/diffbot_simple/v2/image.rb CHANGED
@@ -5,9 +5,5 @@ module DiffbotSimple::V2
      def post_initialize
        @api = :image
      end
-     def single_image url: nil, **options
-       raise ArgumentError.new "Must pass an url to fetch" unless url
-       execute_call options.merge(url: url)
-     end
    end
  end
data/lib/diffbot_simple/v2/product.rb CHANGED
@@ -5,9 +5,5 @@ module DiffbotSimple::V2
      def post_initialize
        @api = :product
      end
-     def single_product url: nil, **options
-       raise ArgumentError.new "Must pass an url to fetch" unless url
-       execute_call options.merge(url: url)
-     end
    end
  end
data/lib/diffbot_simple/version.rb CHANGED
@@ -1,3 +1,3 @@
  module DiffbotSimple
-   VERSION = "0.0.4"
+   VERSION = "1.0.0"
  end
data/lib/diffbot_simple.rb CHANGED
@@ -3,12 +3,15 @@ require 'diffbot_simple/symbolize'
  require 'diffbot_simple/v2/diffbot_error'
  require 'diffbot_simple/v2/api_helper'
  require 'diffbot_simple/v2/client'
- require 'diffbot_simple/v2/crawlbot'
+ require 'diffbot_simple/v2/crawlbot_api'
+ require 'diffbot_simple/v2/crawl'
  require 'diffbot_simple/v2/article'
  require 'diffbot_simple/v2/custom'
  require 'diffbot_simple/v2/product'
  require 'diffbot_simple/v2/image'
  require 'diffbot_simple/v2/analyze'
+ require 'diffbot_simple/v2/bulk_api'
+ require 'diffbot_simple/v2/bulk'
 
  module DiffbotSimple
  end
data/spec/serialize_test_data.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "icon": "http://www.xconomy.com/wordpress/wp-content/themes/xconomy/images/favicon.ico",
+   "author": "Wade Roush",
+ "text": "You know how the Picturephone , a half-billion-dollar project at AT&T back in the 1960s and 1970s, turned out to be a huge commercial flop, but two-way video communication eventually came back with a vengeance in the form of Skype and FaceTime and Google Hangouts? Well, something similar is going on with the Semantic Web.\n\nThat’s the proposal, dating back almost to the invention of the Web in the 1990s, that the various parts of Web pages should be tagged so that machines, as well as people, can make inferences based on the information they contain. The idea has never gotten very far, mainly because the burden of tagging all that content would fall to humans, which makes it expensive and tedious. But now it looks like the original goal of making digital content more comprehensible to computers might be achievable at far lower cost, thanks to better software.\n\nDiffbot is building that software. This unusual startup—the first ever to emerge from the Stanford-based accelerator , back in 2009—is using computer vision technology similar to that used for robotics applications such as self-driving cars to classify the parts of Web pages so that they can be reassembled in other forms. AOL is one of the startup’s first big customers and its landlord. It’s using Diffbot’s technology to assemble Editions by AOL , the personalized, iPad-based magazine comprised of content culled from AOL properties like the Huffington Post, TechCrunch, and Engadget.\n\n\nI went down to AOL’s Palo Alto campus last month to meet the company’s founder and CEO Mike Tung and its vice president of products John Davi. They didn’t deliberately set out to solve the Semantic Web problem, any more than the founders of Skype set out to build an affordable Picturephone. But their venture, which has attracted about $2 million in backing from Andy Bechtolsheim and a raft of other angel investing stars, is already on its way to creating one of the world’s largest structured indexes of unstructured Web content.\n\nWithout relying on HTML tags (which can actually be used to trick traditional Web crawling software), Diffbot can look at a news page and tell what’s a headline, what’s a byline, where the article text begins and ends, what’s an advertisement, and so forth. What practical use can companies make of that, and where’s the profit in it for Diffbot? Well, aside from AOL, the startup’s software is already being used in some interesting places: reading app maker uses it to extract article text from websites, and content discovery service employs it to screen out spam.\n\nIn fact, companies pay Diffbot to analyze more than 100 million unique URLs per month. And that’s just the beginning. Building outward from its early focus on news articles, the startup is creating new algorithms that could make sense of many kinds of sites, such as e-commerce catalogs. The individual elements of those sites could then be served up in almost any context. Imagine a Siri for shopping, to take just one example. “We’re building a series of wedges that will add up to a complete view of the Web,” says Davi. 
“We are excited about having them all under our belt, so there can be a fully indexed, reverse-engineered Semantic Web.”\n\nWhat follows is a highly compressed version of my conversation with Tung and Davi.\n\nXconomy: Where did you guys meet, and how did you end up working on Diffbot?\n\nMike Tung: I worked at Microsoft on Windows Vista right out of high school, then went to college at Cal and studied electrical engineering for two years, then went to Stanford to start a PhD in computer science, specializing in AI. When I first moved to Silicon Valley, I also worked at a bunch of startups. I was engineer number four at TheFind, which was a product search company that built the world’s largest product index. I worked on search at Yahoo and eBay, and also did a bunch of contract work. I took the patent bar and worked as a patent lawyer for a couple of years, writing 3G and 4G patents for Panasonic and Matsushita. I first met John when we were working at a startup called ClickTV, which was a video-player-search-engine thing. It was pretty advanced for its time.\n\nDiffbot began when I was in grad school at Stanford [in 2005]. There was this one quarter where I was taking a lot of classes, so I made this tool for myself to keep track of all of them. I would put in the URL for the class website, and whenever a professor would upload new slides or content, Diffbot would find that and download it to my phone. I always felt like I knew what was going on in my classes without having to attend every single one.\n\nIt was useful, and my friends started asking me whether they could use it. So I turned it into a Web service and \n\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can subscribe to his Google Group or e-mail him at wroush@xconomy.com .\n(Page 2 of 4) \n\n\t\t\tstarted running it out of a dorm at Stanford. And people started adding a bunch of different kinds of URLs to Diffbot outside of classes, like they might add Craigslist if they were searching for a job or a product, or Facebook if they wanted to see if their ex’s profile had changed.\n\nX: So I assume the name “Diffbot” related to comparing the old and new versions of a website and detecting the differences?\n\nMT: Yes, but just doing deltas on Web pages doesn’t work too well. It turns out that on the modern Web, every page refresh changes the ads and the counters. You have to be a little more intelligent.\n\nThat’s where understanding the page comes into play. I was studying machine learning at Stanford, and in particular one project I had worked on was the vision system for the self-driving car [Stanford’s entry in the 2007 DARPA Urban Challenge]. This was the stereo camera system that would compute the depth of a scene and say, ‘This is a cactus, this is drivable dirt, this is not drivable dirt, this is a cliff, this is a very narrow passageway.’ I realized that one way of making Diffbot generalizable was to apply computer vision to Web pages. Not to say, ‘This is a cactus and this is a pedestrian,’ but to say, ‘This is an advertisement and this is a footer and this is a product.’\n\nA human being can look at Web page and very easily tell what type of page it is without even looking at the text, and that is what we are teaching Diffbot to do. 
The goal is to build a machine-readable version of the entire Web.\n\nX: Isn’t that what Tim Berners-Lee has been talking about for years—building a Semantic Web that’s machine-readable?\n\nMT: It seems that every three years or so a new Semantic Web technology gets hyped up again. There was RSS, RDF, OWL, and now it’s Open Graph and the Knowledge Graph. The central problem—why none of these have really gone mainstream—is that you are requiring humans to tag the content twice, once for the machine’s benefit and once for the actual humans. Because you are placing so much onus on the content creators, you are never going to have all of the content in any given system. So it will be fragmented into different Semantic Web file formats, and because of that you will never have an app that allows you to search and evaluate all that information.\n\nBut what if you analyze the page itself? That is where we have an opportunity, by applying computer vision to eliminate the problem of manual tagging. And we have reached a certain point in the technology continuum where it is actually possible—where the CPUs are fast enough and the machine learning technology is good enough that we have a good shot of doing it with high accuracy.\n\nX: Why are you so convinced that a human-tagged Semantic Web would never work?\n\nMT: The number one point is that people are lazy. The second is that people lie. Google used to read the meta tags and keywords at the top of a Web page, and so people would start stuffing those areas with everything. It didn’t correspond to what actual humans saw. The same thing holds for Semantic Web formats. Whenever you have things indexed separately, you start to see spam. By using a robot to look at the page, you are keeping it above that.\n\nX: Talk about the computer vision aspect of Diffbot. How literal is the comparison to the cameras and radar on robot cars?\n\nMT: We use the very same techniques used in computer vision, for example object detection and edge detection. If you are a customer, you give us a URL to analyze. We render the page using a virtual Webkit browser in the cloud. It will render the page, run the Javascript, and lay everything out with the CSS rules and everything. Then we have these hooks into Webkit that \n\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can subscribe to his Google Group or e-mail him at wroush@xconomy.com .\n(Page 3 of 4) \n\n\t\t\tallow us to get all of the visual and geometric information out of the page. For every rectangle, we pull out things like the x and y coordinates, the heights and widths, the positioning relative to everything else, the font sizes, the colors, and other visual cues. In much the same way, when I was working on the self-driving car, we would look at a patch and do edge detection to determine the shape of a thing or find the horizon.\n\nX: Once you identify those shapes and other elements, how do you say, “This is a headline, this is an article,” et cetera?\n\nMT: We have an ontology. Other people have done good work defining what those ontologies should be—there are many of them at schema.org, which reflects what the search engines have proposed as ontologies. We also have human beings who draw rectangles on the pages and teach Diffbot “this is what an author field looks like, this is what a product looks like, this is what a price looks like,” and from those rectangles we can generalize. 
It’s a machine learning system, so it lives and breathes on the training data that is fed into it.\n\nX: Do you actually do all the training work yourselves, or do you crowdsource it out somehow?\n\nJohn Davi: We have done a combination of things. We always have a cold-start problem firing up new type of pages—products versus articles, or a new algorithm for press releases, for example. We leverage both grunt work internally—just grinding out our own examples, which has the side benefit of keeping us informed about the real world—but yeah, also crowdsourcing, which gives us a much broader variety of input and opinion. We have used everything, including off-the-shelf crowdsourcing tools like Mechanical Turk and Crowdflower, and we have build up our own group of quasi-contract crowdsourcers.\n\nOur basic effort is to cold-start it ourselves, then get an alpha-level product into the hands of our customer, which will then drastically increase the amount of training data we have. Sometimes we look at the stream of content and eyeball it and manually tweak and correct. In a lot of cases our customer gets involved. If they have an interest in helping to train the algorithm—it not only makes it better for them, but if they are first out of the gate they can tailor the algorithm to their very particular needs.\n\nX: How much can your algorithms tell about a Web page just from the way it looks? Are you also analyzing the actual text?\n\nMT: First we take a URL and determine what type of page it is. We’ve identified roughly 20 types of pages that all the Web can fall into. Article pages, people pages, product pages, photos, videos, and so on. So one of the fields we return will be what is the type of this thing. Then, depending on the type, there are other fields. For the article API [application programming interface], which is one we have out publicly, we can tell you the title, the author, the images, the videos, and the text that go with that article. And we not only identify where the text is, but we can tell you the topics. We do some natural language processing on the text and we can tell you “This is about Apple,” and we can tell it’s about Apple Computer and not the fruit.\n\nJD: Another opportunity we are excited about his how Diffbot can help augment what is natively on the page. Just by dint of following so many pages through our system, we can augment [the existing formatting] and increase the value for whoever is reading. In the case of an article, the fact that we see so many articles means it’s relatively easy for us to generate tags for any given text.\n\nX: How do you turn this all into a business?\n\nMT: We are actually selling something. We are trying to build the Semantic Web, but in a profitable way. We analyze the pages that people pay us to analyze. That’s currently over 100 million URLs per month, which is a good slice of the Web. Other startups have taken the approach of starting by crawling and indexing the Web, and that is very capital-intensive. By doing it this way, another benefit is that people only send us the best parts of \n\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can subscribe to his Google Group or e-mail him at wroush@xconomy.com .\n(Page 4 of 4) \n\n\t\t\tthe Web. Most of the stuff a typical Web crawler goes through never appears in any search results. 
Most of the Web is crap.\n\nX: Are people finding uses for the technology that you may not have thought of?\n\nMT: We had a hackathon last year where a guy came in and built an app for his father, who is blind. It runs Diffbot on a page and makes it into a radio station. For someone who is blind, browsing a news site is usually a really poor experience. The usual screen readers will read the entire page, including the nav bars and the ads and the text. The screen readers have no context about what is important on the page. Using Diffbot to be his father’s eyes, this guy could parse the page and read it in a way that is much more natural.\n\nJD: AOL’s Editions app is one of the more interesting use cases that I’ve seen. It’s an iPad app that features both their own content as well as snippets from across the Web, in a daily issue. I spent five years running engineering for the media solutions group at Cisco, selling a Web platform for media companies, and the biggest problem we faced was dealing with the excess of content management systems that all media companies have. In the case of Editions, AOL has myriad properties that they want to merge into this single app. But rather than consolidate TechCrunch and Engadget and the Huffington Post and a half dozen other sites, they use Diffbot to build a kind of content management system on the fly from the rendered Web pages. They extract the content and deliver it on the fly as if it came from a CMS right to the iPad magazine.\n\nStumbleUpon is another interesting one. They use Diffbot as their moderation queue. Whenever a new website is submitted to their index, they want to make sure it’s legitimate before it’s available for stumbling. They have to rule out people who stumble a page, then swap it out for spam. So they run Diffbot on the source page, pipe that into their moderation queue, and if it looks like a legitimate page they can monitor that and keep checking on a regular basis to see how much it changes. If it has changed much between day 1 and day 10, it might warrant human intervention.\n\nX: Aren’t there are a lot of news reader app these days that are doing the same thing you’re doing when it comes to identifying and isolating the text of a news article? That’s what Instapaper and Pocket and Readability and Zite are all doing.\n\nMT: We power a lot of those apps. Our audience is the developers who work at those companies, who use our API to create their experience.\n\nJD: We make it a lot more affordable to make those kinds of forays. When you look at building your own customized extraction tools, you are talking about multiple developers over weeks or months, to build something that is more brittle than what we offer out of the gate. Our ultimate goal is to be not only better but a lot cheaper than what you could build.\n\nX: It’s not totally clear yet, though, whether publications or apps that aggregate lots of content from elsewhere, like Editions or even Flipboard, are going to be profitable in the long term, and where publishing is going as a business. Don’t you guys feel there’s some risk in tying your fortunes to such a troubled industry?\n\nMT: The more interesting question is how do you monetize the Semantic Web, and where is the money in building the structured information. Articles are only one page type. Another that I mentioned is products. If you could show products on a cell phone, and people could buy the product and we could make that transaction happen, that is one very tangible way of making money. 
I think there is a lot of value in having structured information, because you can connect people more directly to what they want. Once we have the entire Web in machine-readable format, anybody who wants to use any sort of data can use the Diffbot view of it, and I think a lot of those apps can make money. Look at Siri—it’s great but it only works with the 10 or so sources that it’s hard-coded to work with. If you were able to combine Siri with Diffbot, Siri could operate on the Web and take a query and actually do it for you.\n\nX: What page types will you move on to next? Did you start with articles because those are easiest?\n\nMT: I wouldn’t say they were easiest, but they are pretty prevalent on the Web. A variety of factors help us prioritize what we should do next. One signal is what is the prevalence of that type of page on the Web. If doing one page type lets us knock out 30 percent of the Web, maybe we will go for it.\n\nX: Will there always be a need for Diffbot, or with the transition to HTML 5, will Web pages gradually get more structure on their own?\n\nMT: If you look at the ratio of unstructured pages to structured, it’s actually going in the opposite direction. I think human beings are creative, and they design pages for other humans. No matter what, people will find a way to create documents that lie outside of the well-defined tags, whether it’s HTML 5 or Flash or PDF or Xbox. What they all have in common is that they are just vessels that we can easily train and adapt Diffbot to work with.\n\n\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can subscribe to his Google Group or e-mail him at wroush@xconomy.com .",
+   "title": "Diffbot Is Using Computer Vision to Reinvent the Semantic Web",
+   "nextPages": [
+     "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/2/",
+     "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/3/",
+     "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/4/"
+   ],
+   "images": [
+     {
+       "primary": "true",
+       "url": "http://www.xconomy.com/wordpress/wp-content/images/2012/07/diffbot-reclining-220x146.jpg"
+     },
+     {
+       "caption": "NPR's top news page as interpreted by Diffbot (click for larger version)",
+       "url": "http://www.xconomy.com/wordpress/wp-content/images/2012/07/Screen-Shot-2012-07-25-at-9.13.27-AM-300x332.png"
+     },
+     {
+       "primary": "true",
+       "url": "http://static.xconomy.com/Advertisers/25df47a9f35f4e1dad18a71f750e11a6.jpg"
+     },
+     {
+       "caption": "Diffbot robot",
+       "url": "http://www.xconomy.com/wordpress/wp-content/images/2012/07/12-220x265.png"
+     },
+     {
+       "url": "http://static.xconomy.com/Advertisers/c79cd8619bfd4209bfffe1c1602cee17.jpg"
+     }
+   ],
+ "html": "<img alt=\"Diffbot\" class=\"attachment-200x9999 wp-post-image diffbot_image\" height=\"132\" src=\"http://www.xconomy.com/wordpress/wp-content/images/2012/07/diffbot-reclining-220x146.jpg\" width=\"200\"></img><div>\n\t\t\t\n\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t<p>You know how the <a class=\"ext-link\" href=\"http://www.corp.att.com/attlabs/reputation/timeline/70picture.html\" onclick=\"javascript:_gaq.push(['_trackEvent','outbound-article','http://www.corp.att.com']);\" rel=\"external\" target=\"_blank\" title=\"\">Picturephone</a>, a half-billion-dollar project at AT&amp;T back in the 1960s and 1970s, turned out to be a huge commercial flop, but two-way video communication eventually came back with a vengeance in the form of Skype and FaceTime and Google Hangouts? Well, something similar is going on with the Semantic Web.</p>\n<p>That&rsquo;s the proposal, dating back almost to the invention of the Web in the 1990s, that the various parts of Web pages should be tagged so that machines, as well as people, can make inferences based on the information they contain. The idea has never gotten very far, mainly because the burden of tagging all that content would fall to humans, which makes it expensive and tedious. But now it looks like the original goal of making digital content more comprehensible to computers might be achievable at far lower cost, thanks to better software.</p>\n<p><a class=\"ext-link\" href=\"http://www.diffbot.com\" onclick=\"javascript:_gaq.push(['_trackEvent','outbound-article','http://www.diffbot.com']);\" rel=\"external\" target=\"_blank\" title=\"\">Diffbot</a> is building that software. This unusual startup&mdash;the first ever to emerge from the Stanford-based accelerator , back in 2009&mdash;is using computer vision technology similar to that used for robotics applications such as self-driving cars to classify the parts of Web pages so that they can be reassembled in other forms. AOL is one of the startup&rsquo;s first big customers and its landlord. It&rsquo;s using Diffbot&rsquo;s technology to assemble <a href=\"http://www.xconomy.com/national/2012/01/20/news-readers/2/\">Editions by AOL</a>, the personalized, iPad-based magazine comprised of content culled from AOL properties like the Huffington Post, TechCrunch, and Engadget.</p>\n\n<p>I went down to AOL&rsquo;s Palo Alto campus last month to meet the company&rsquo;s founder and CEO Mike Tung and its vice president of products John Davi. They didn&rsquo;t deliberately set out to solve the Semantic Web problem, any more than the founders of Skype set out to build an affordable Picturephone. But their venture, which <a href=\"http://www.xconomy.com/san-francisco/2012/06/01/diffbot-garners-2000000-new-round/\">has attracted about $2 million</a> in backing from Andy Bechtolsheim and a raft of other angel investing stars, is already on its way to creating one of the world&rsquo;s largest structured indexes of unstructured Web content.</p>\n<p>Without relying on HTML tags (which can actually be used to trick traditional Web crawling software), Diffbot can look at a news page and tell what&rsquo;s a headline, what&rsquo;s a byline, where the article text begins and ends, what&rsquo;s an advertisement, and so forth. What practical use can companies make of that, and where&rsquo;s the profit in it for Diffbot? 
Well, aside from AOL, the startup&rsquo;s software is already being used in some interesting places: reading app maker uses it to extract article text from websites, and content discovery service employs it to screen out spam.</p>\n<p>In fact, companies pay Diffbot to analyze more than 100 million unique URLs per month. And that&rsquo;s just the beginning. Building outward from its early focus on news articles, the startup is creating new algorithms that could make sense of many kinds of sites, such as e-commerce catalogs. The individual elements of those sites could then be served up in almost any context. Imagine a Siri for shopping, to take just one example. &ldquo;We&rsquo;re building a series of wedges that will add up to a complete view of the Web,&rdquo; says Davi. &ldquo;We are excited about having them all under our belt, so there can be a fully indexed, reverse-engineered Semantic Web.&rdquo;</p>\n<p>What follows is a highly compressed version of my conversation with Tung and Davi.</p>\n<p><strong>Xconomy:</strong> Where did you guys meet, and how did you end up working on Diffbot?</p>\n<p><strong>Mike Tung:</strong> I worked at Microsoft on Windows Vista right out of high school, then went to college at Cal and studied electrical engineering for two years, then went to Stanford to start a PhD in computer science, specializing in AI. When I first moved to Silicon Valley, I also worked at a bunch of startups. I was engineer number four at TheFind, which was a product search company that built the world&rsquo;s largest product index. I worked on search at Yahoo and eBay, and also did a bunch of contract work. I took the patent bar and worked as a patent lawyer for a couple of years, writing 3G and 4G patents for Panasonic and Matsushita. I first met John when we were working at a startup called ClickTV, which was a video-player-search-engine thing. It was pretty advanced for its time.</p>\n<p>Diffbot began when I was in grad school at Stanford [in 2005]. There was this one quarter where I was taking a lot of classes, so I made this tool for myself to keep track of all of them. I would put in the URL for the class website, and whenever a professor would upload new slides or content, Diffbot would find that and download it to my phone. I always felt like I knew what was going on in my classes without having to attend every single one.</p>\n<p>It was useful, and my friends started asking me whether they could use it. So I turned it into a Web service and </p>\n\t\t\t\t\t\t\t\t\t\t<p class=\"authorBio clearFix\">\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can <a href=\"https://groups.google.com/forum/?hl=en_US&fromgroups#!forum/waderoush\">subscribe to his Google Group</a> or e-mail him at <a href=\"mailto:wroush@xconomy.com\">wroush@xconomy.com</a>. 
\t\t\t\t\t\t\t\t</p>\n\t\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t</div><img alt=\"\" class=\"size-large wp-image-198170 diffbot_image\" height=\"332\" src=\"http://www.xconomy.com/wordpress/wp-content/images/2012/07/Screen-Shot-2012-07-25-at-9.13.27-AM-300x332.png\" title=\"NPR's top news page as interpreted by Diffbot\" width=\"300\"></img><br class=\"diffbot_nextPage\"> <img alt=\"Diffbot\" class=\"attachment-200x9999 wp-post-image diffbot_image\" height=\"132\" src=\"http://www.xconomy.com/wordpress/wp-content/images/2012/07/diffbot-reclining-220x146.jpg\" width=\"200\"></img><div>\n\t\t\t\n\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t<p> (Page 2 of 4) </p>\n\t\t\t<p>started running it out of a dorm at Stanford. And people started adding a bunch of different kinds of URLs to Diffbot outside of classes, like they might add Craigslist if they were searching for a job or a product, or Facebook if they wanted to see if their ex&rsquo;s profile had changed.</p>\n<p><strong>X:</strong> So I assume the name &ldquo;Diffbot&rdquo; related to comparing the old and new versions of a website and detecting the differences?</p>\n<p><strong>MT:</strong> Yes, but just doing deltas on Web pages doesn&rsquo;t work too well. It turns out that on the modern Web, every page refresh changes the ads and the counters. You have to be a little more intelligent.</p>\n<p>That&rsquo;s where understanding the page comes into play. I was studying machine learning at Stanford, and in particular one project I had worked on was the vision system for the self-driving car [Stanford&rsquo;s entry in the 2007 DARPA Urban Challenge]. This was the stereo camera system that would compute the depth of a scene and say, &lsquo;This is a cactus, this is drivable dirt, this is not drivable dirt, this is a cliff, this is a very narrow passageway.&rsquo; I realized that one way of making Diffbot generalizable was to apply computer vision to Web pages. Not to say, &lsquo;This is a cactus and this is a pedestrian,&rsquo; but to say, &lsquo;This is an advertisement and this is a footer and this is a product.&rsquo;</p>\n<p>A human being can look at Web page and very easily tell what type of page it is without even looking at the text, and that is what we are teaching Diffbot to do. The goal is to build a machine-readable version of the entire Web.</p>\n<p><strong>X:</strong> Isn&rsquo;t that what Tim Berners-Lee has been talking about for years&mdash;building a Semantic Web that&rsquo;s machine-readable?</p>\n<p><strong>MT:</strong> It seems that every three years or so a new Semantic Web technology gets hyped up again. There was RSS, RDF, OWL, and now it&rsquo;s Open Graph and the Knowledge Graph. The central problem&mdash;why none of these have really gone mainstream&mdash;is that you are requiring humans to tag the content twice, once for the machine&rsquo;s benefit and once for the actual humans. Because you are placing so much onus on the content creators, you are never going to have all of the content in any given system. So it will be fragmented into different Semantic Web file formats, and because of that you will never have an app that allows you to search and evaluate all that information.</p>\n<p>But what if you analyze the page itself? That is where we have an opportunity, by applying computer vision to eliminate the problem of manual tagging. 
And we have reached a certain point in the technology continuum where it is actually possible&mdash;where the CPUs are fast enough and the machine learning technology is good enough that we have a good shot of doing it with high accuracy.</p>\n<p><strong>X:</strong> Why are you so convinced that a human-tagged Semantic Web would never work?</p>\n<p><strong>MT:</strong> The number one point is that people are lazy. The second is that people lie. Google used to read the meta tags and keywords at the top of a Web page, and so people would start stuffing those areas with everything. It didn&rsquo;t correspond to what actual humans saw. The same thing holds for Semantic Web formats. Whenever you have things indexed separately, you start to see spam. By using a robot to look at the page, you are keeping it above that.</p>\n<p><strong>X:</strong> Talk about the computer vision aspect of Diffbot. How literal is the comparison to the cameras and radar on robot cars?</p>\n<p><strong>MT:</strong> We use the very same techniques used in computer vision, for example object detection and edge detection. If you are a customer, you give us a URL to analyze. We render the page using a virtual Webkit browser in the cloud. It will render the page, run the Javascript, and lay everything out with the CSS rules and everything. Then we have these hooks into Webkit that </p>\n\t\t\t\t\t\t\t\t\t\t<p class=\"authorBio clearFix\">\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can <a href=\"https://groups.google.com/forum/?hl=en_US&fromgroups#!forum/waderoush\">subscribe to his Google Group</a> or e-mail him at <a href=\"mailto:wroush@xconomy.com\">wroush@xconomy.com</a>. \t\t\t\t\t\t\t\t</p>\n\t\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t</div><br class=\"diffbot_nextPage\"> <img class=\"diffbot_image\" src=\"http://static.xconomy.com/Advertisers/25df47a9f35f4e1dad18a71f750e11a6.jpg\"></img><div>\n\t\t\t\n\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t<p> (Page 3 of 4) </p>\n\t\t\t<p>allow us to get all of the visual and geometric information out of the page. For every rectangle, we pull out things like the x and y coordinates, the heights and widths, the positioning relative to everything else, the font sizes, the colors, and other visual cues. In much the same way, when I was working on the self-driving car, we would look at a patch and do edge detection to determine the shape of a thing or find the horizon.</p>\n<p><strong>X:</strong> Once you identify those shapes and other elements, how do you say, &ldquo;This is a headline, this is an article,&rdquo; et cetera?</p>\n<p><strong>MT:</strong> We have an ontology. Other people have done good work defining what those ontologies should be&mdash;there are many of them at schema.org, which reflects what the search engines have proposed as ontologies. We also have human beings who draw rectangles on the pages and teach Diffbot &ldquo;this is what an author field looks like, this is what a product looks like, this is what a price looks like,&rdquo; and from those rectangles we can generalize. 
It&rsquo;s a machine learning system, so it lives and breathes on the training data that is fed into it.</p>\n<p><strong>X:</strong> Do you actually do all the training work yourselves, or do you crowdsource it out somehow?</p>\n<p><strong><a href=\"http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/attachment/12-3/\" rel=\"attachment wp-att-198179\"><img alt=\"\" class=\"alignleft size-medium wp-image-198179 diffbot_image\" height=\"265\" src=\"http://www.xconomy.com/wordpress/wp-content/images/2012/07/12-220x265.png\" title=\"Diffbot robot\" width=\"220\"></img></a>John Davi:</strong> We have done a combination of things. We always have a cold-start problem firing up new type of pages&mdash;products versus articles, or a new algorithm for press releases, for example. We leverage both grunt work internally&mdash;just grinding out our own examples, which has the side benefit of keeping us informed about the real world&mdash;but yeah, also crowdsourcing, which gives us a much broader variety of input and opinion. We have used everything, including off-the-shelf crowdsourcing tools like Mechanical Turk and Crowdflower, and we have build up our own group of quasi-contract crowdsourcers.</p>\n<p>Our basic effort is to cold-start it ourselves, then get an alpha-level product into the hands of our customer, which will then drastically increase the amount of training data we have. Sometimes we look at the stream of content and eyeball it and manually tweak and correct. In a lot of cases our customer gets involved. If they have an interest in helping to train the algorithm&mdash;it not only makes it better for them, but if they are first out of the gate they can tailor the algorithm to their very particular needs.</p>\n<p><strong>X:</strong> How much can your algorithms tell about a Web page just from the way it looks? Are you also analyzing the actual text?</p>\n<p><strong>MT:</strong> First we take a URL and determine what type of page it is. We&rsquo;ve identified roughly 20 types of pages that all the Web can fall into. Article pages, people pages, product pages, photos, videos, and so on. So one of the fields we return will be what is the type of this thing. Then, depending on the type, there are other fields. For the article API [application programming interface], which is one we have out publicly, we can tell you the title, the author, the images, the videos, and the text that go with that article. And we not only identify where the text is, but we can tell you the topics. We do some natural language processing on the text and we can tell you &ldquo;This is about Apple,&rdquo; and we can tell it&rsquo;s about Apple Computer and not the fruit.</p>\n<p><strong>JD:</strong> Another opportunity we are excited about his how Diffbot can help augment what is natively on the page. Just by dint of following so many pages through our system, we can augment [the existing formatting] and increase the value for whoever is reading. In the case of an article, the fact that we see so many articles means it&rsquo;s relatively easy for us to generate tags for any given text.</p>\n<p><strong>X:</strong> How do you turn this all into a business?</p>\n<p><strong>MT:</strong> We are actually selling something. We are trying to build the Semantic Web, but in a profitable way. We analyze the pages that people pay us to analyze. That&rsquo;s currently over 100 million URLs per month, which is a good slice of the Web. 
Other startups have taken the approach of starting by crawling and indexing the Web, and that is very capital-intensive. By doing it this way, another benefit is that people only send us the best parts of </p>\n\t\t\t\t\t\t\t\t\t\t<p class=\"authorBio clearFix\">\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can <a href=\"https://groups.google.com/forum/?hl=en_US&fromgroups#!forum/waderoush\">subscribe to his Google Group</a> or e-mail him at <a href=\"mailto:wroush@xconomy.com\">wroush@xconomy.com</a>. \t\t\t\t\t\t\t\t</p>\n\t\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t</div><img alt=\"Diffbot\" class=\"attachment-200x9999 wp-post-image diffbot_image\" height=\"132\" src=\"http://www.xconomy.com/wordpress/wp-content/images/2012/07/diffbot-reclining-220x146.jpg\" width=\"200\"></img><br class=\"diffbot_nextPage\"> <img class=\"diffbot_image\" src=\"http://static.xconomy.com/Advertisers/25df47a9f35f4e1dad18a71f750e11a6.jpg\"></img><div>\n\t\t\t\n\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t \t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\t<p> (Page 4 of 4) </p>\n\t\t\t<p>the Web. Most of the stuff a typical Web crawler goes through never appears in any search results. Most of the Web is crap.</p>\n<p><strong>X:</strong> Are people finding uses for the technology that you may not have thought of?</p>\n<p><strong>MT:</strong> We had a hackathon last year where a guy came in and built an app for his father, who is blind. It runs Diffbot on a page and makes it into a radio station. For someone who is blind, browsing a news site is usually a really poor experience. The usual screen readers will read the entire page, including the nav bars and the ads and the text. The screen readers have no context about what is important on the page. Using Diffbot to be his father&rsquo;s eyes, this guy could parse the page and read it in a way that is much more natural.</p>\n<p><strong>JD:</strong> AOL&rsquo;s Editions app is one of the more interesting use cases that I&rsquo;ve seen. It&rsquo;s an iPad app that features both their own content as well as snippets from across the Web, in a daily issue. I spent five years running engineering for the media solutions group at Cisco, selling a Web platform for media companies, and the biggest problem we faced was dealing with the excess of content management systems that all media companies have. In the case of Editions, AOL has myriad properties that they want to merge into this single app. But rather than consolidate TechCrunch and Engadget and the Huffington Post and a half dozen other sites, they use Diffbot to build a kind of content management system on the fly from the rendered Web pages. They extract the content and deliver it on the fly as if it came from a CMS right to the iPad magazine.</p>\n<p>StumbleUpon is another interesting one. They use Diffbot as their moderation queue. Whenever a new website is submitted to their index, they want to make sure it&rsquo;s legitimate before it&rsquo;s available for stumbling. They have to rule out people who stumble a page, then swap it out for spam. So they run Diffbot on the source page, pipe that into their moderation queue, and if it looks like a legitimate page they can monitor that and keep checking on a regular basis to see how much it changes. 
If it has changed much between day 1 and day 10, it might warrant human intervention.</p>\n<p><strong>X:</strong> Aren&rsquo;t there are a lot of news reader app these days that are doing the same thing you&rsquo;re doing when it comes to identifying and isolating the text of a news article? That&rsquo;s what Instapaper and Pocket and Readability and Zite are all doing.</p>\n<p><strong>MT:</strong> We power a lot of those apps. Our audience is the developers who work at those companies, who use our API to create their experience.</p>\n<p><strong>JD:</strong> We make it a lot more affordable to make those kinds of forays. When you look at building your own customized extraction tools, you are talking about multiple developers over weeks or months, to build something that is more brittle than what we offer out of the gate. Our ultimate goal is to be not only better but a lot cheaper than what you could build.</p>\n<p><strong>X:</strong> It&rsquo;s not totally clear yet, though, whether publications or apps that aggregate lots of content from elsewhere, like Editions or even Flipboard, are going to be profitable in the long term, and where publishing is going as a business. Don&rsquo;t you guys feel there&rsquo;s some risk in tying your fortunes to such a troubled industry?</p>\n<p><strong>MT:</strong> The more interesting question is how do you monetize the Semantic Web, and where is the money in building the structured information. Articles are only one page type. Another that I mentioned is products. If you could show products on a cell phone, and people could buy the product and we could make that transaction happen, that is one very tangible way of making money. I think there is a lot of value in having structured information, because you can connect people more directly to what they want. Once we have the entire Web in machine-readable format, anybody who wants to use any sort of data can use the Diffbot view of it, and I think a lot of those apps can make money. Look at Siri&mdash;it&rsquo;s great but it only works with the 10 or so sources that it&rsquo;s hard-coded to work with. If you were able to combine Siri with Diffbot, Siri could operate on the Web and take a query and actually do it for you.</p>\n<p><strong>X:</strong> What page types will you move on to next? Did you start with articles because those are easiest?</p>\n<p><strong>MT:</strong> I wouldn&rsquo;t say they were easiest, but they are pretty prevalent on the Web. A variety of factors help us prioritize what we should do next. One signal is what is the prevalence of that type of page on the Web. If doing one page type lets us knock out 30 percent of the Web, maybe we will go for it.</p>\n<p><strong>X:</strong> Will there always be a need for Diffbot, or with the transition to HTML 5, will Web pages gradually get more structure on their own?</p>\n<p><strong>MT:</strong> If you look at the ratio of unstructured pages to structured, it&rsquo;s actually going in the opposite direction. I think human beings are creative, and they design pages for other humans. No matter what, people will find a way to create documents that lie outside of the well-defined tags, whether it&rsquo;s HTML 5 or Flash or PDF or Xbox. 
What they all have in common is that they are just vessels that we can easily train and adapt Diffbot to work with.<span class=\"read_more\"> <a href=\"http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/3/\"></a></span></p>\n\t\t\t\t\t\t\t\t\t\t<p class=\"authorBio clearFix\">\n\t\t\t\t Wade Roush is Xconomy's chief correspondent and editor of Xconomy San Francisco. You can <a href=\"https://groups.google.com/forum/?hl=en_US&fromgroups#!forum/waderoush\">subscribe to his Google Group</a> or e-mail him at <a href=\"mailto:wroush@xconomy.com\">wroush@xconomy.com</a>. \t\t\t\t\t\t\t\t</p>\n\t\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t</div><img alt=\"Diffbot\" class=\"attachment-200x9999 wp-post-image diffbot_image\" height=\"132\" src=\"http://www.xconomy.com/wordpress/wp-content/images/2012/07/diffbot-reclining-220x146.jpg\" width=\"200\"></img><img alt=\"Stratos\" border=\"0\" class=\"diffbot_image\" height=\"250\" src=\"http://static.xconomy.com/Advertisers/c79cd8619bfd4209bfffe1c1602cee17.jpg\" title=\"Stratos\" width=\"300\"></img>",
+   "numPages": 4,
+   "date": "Wed, 25 Jul 2012 07:00:00 GMT",
+   "type": "article",
+   "human_language": "en",
+   "url": "http://www.xconomy.com/san-francisco/2012/07/25/diffbot-is-using-computer-vision-to-reinvent-the-semantic-web/"
+ }
@@ -0,0 +1,17 @@
+ require 'spec_helper'
+
+ module DiffbotSimple
+   describe Symbolize do
+     let(:test_data) { MultiJson.load File.read("spec/serialize_test_data.json") }
+     let(:subject) { Symbolize.symbolize test_data }
+     context "when symbolizing the test data" do
+       it "should not raise errors" do
+         expect{ subject }.to_not raise_error
+       end
+       it "should have :nextPages as an array" do
+         expect(subject[:nextPages]).to be_a Array
+       end
+     end
+   end
+
+ end
@@ -16,12 +16,12 @@ module DiffbotSimple::V2
        it "should return the response body as a symbolized hash" do
          expect(subject).to eql JSON.parse(single_response[:body], symbolize_names: true)
        end
-       it "should respond and return the apis url in to_crawl_api_url" do
-         expect(analyze.to_crawl_api_url).to eql "#{api_url}?mode=auto"
+       it "should respond and return the apis url in to_api_url" do
+         expect(analyze.to_api_url).to eql "#{api_url}?mode=auto"
        end
      end
      context "when asking for an analyze with no options" do
-       let(:subject) { analyze.single_analysis url: url}
+       let(:subject) { analyze.request url: url}
        let(:stubbed_request) { stub_request(:get, api_url).with(query: {token: token, url: url}).to_return(single_response) }
        it_should_behave_like "an analyze request"
      end
@@ -29,7 +29,7 @@ module DiffbotSimple::V2
        let(:fields) {"a,b,c"}
        let(:mode) { "article" }
        let(:stats) { true }
-       let(:subject) { analyze.single_analyze url: url, stats: stats, mode: mode, fields: fields }
+       let(:subject) { analyze.request url: url, stats: stats, mode: mode, fields: fields }
        let(:stubbed_request) { stub_request(:get, api_url).with(query: {token: token, url: url, stats: stats.to_s, mode: mode, fields: fields}).to_return(single_response) }
        it_should_behave_like "an analyze request"
      end