husc 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 18698eb5d732d0d86377f031fd3dda73ff368c7ad4b10f9c7a5a45e08f325a98
4
- data.tar.gz: 9907d28762adb13ac57ae7da4a530f23ce57794f71fb2a0c502a54a53d8b73f7
3
+ metadata.gz: 70da435ce2b15bb485ce958997a91488cf19f00b7faf5da3f25c92a891028508
4
+ data.tar.gz: b363fa06b547c1a5612889739465af7ea46cbeea6b6922aab0d484fa21169ce6
5
5
  SHA512:
6
- metadata.gz: 698671388fa2b6da20b2af46e24d86eaad31f6d06a28633d83795eaf9b33473ad65c36a75791a7434414a9f64f895409b1097a786b9c79cb2e4669bb6d73140a
7
- data.tar.gz: e856cb96440b8faa60f159becc6cdf26418462c57add0d46ffce5fb40f4d6193689fb05eff0867de144f5e175df60e398a1238b75bc1cf097eed21aa7ad6a875
6
+ metadata.gz: 2190d0269954730626eca1d142c8ffd00614bf98fb19d7ba4d595668e944d769636f26c151211a664ba775b2d9f1c3b5594bbd7d5cde05eec7af31e11f82dfff
7
+ data.tar.gz: 5f0ea49baf5d2fbf6a707f3b580e95364686c2a3e1439d8c3469c090335790ebeef062176e3d67e3afd7592b23e2256f92f6421a14e4b208c94b4e5a1e737a7f
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
- Crawler
1
+ Husc
2
2
  =======
3
3
 
4
- Script for crawling in Ruby
4
+ A simple crawling utility for Ruby.
5
5
 
6
6
 
7
7
  ## Description
@@ -16,29 +16,74 @@ This project enables site crawling and data extraction with xpath and css select
16
16
  ## Usage
17
17
  ### Simple Example
18
18
  ```ruby
19
- require './rbcrawl.rb'
19
+ require 'husc'
20
20
 
21
21
  url = 'http://www.example.com/'
22
- doc = RbCrawl.new(url)
22
+ doc = Husc(url)
23
23
 
24
- # Search for nodes by css
24
+ # access another url
25
+ doc.get('another url')
26
+
27
+ # get current url
28
+ doc.url
29
+
30
+ # get current site's html
31
+ doc.html
32
+
33
+ # get <table> tags as dict
34
+ doc.tables
35
+ # ex) doc.tables['予約・お問い合わせ'] => 050-5596-6465
36
+ ```
37
+
38
+ ### Scraping Example
39
+ ```ruby
40
+ # search for nodes by css selector
41
+ # tag : css('name')
42
+ # class : css('.name')
43
+ # id : css('#name')
25
44
  doc.css('div')
26
45
  doc.css('.main-text')
27
46
  doc.css('#tadjs')
28
47
 
29
- # Search for nodes by xpath
48
+ # search for nodes by xpath
30
49
  doc.xpath('//*[@id="top"]/div[1]')
31
50
 
32
- # Others
33
- doc.css('div').css('a')[2].attr('href')
34
- doc.css('p').innerText()
35
- doc.tables # -> Table Tag to Dict
36
-
51
+ # other example
52
+ doc.css('div').css('a')[2].attr('href') # => string object
53
+ doc.css('p').innerText() # => string object
37
54
  # You do not need to specify "[]" to access the first index
38
55
  ```
39
56
 
57
+ ### Submitting Form Example
58
+ 1. Specify target node's attribute
59
+ 2. Specify value(int or str) / check(bool) / file_name(str)
60
+ 3. Call submit() with the form's attribute specified
61
+ ```ruby
62
+ # login
63
+ doc.send(id:'id attribute', value:'value to send')
64
+ doc.send(id:'id attribute', value:'value to send')
65
+ doc.submit(id:'id attribute') # submit
66
+
67
+ # post file
68
+ doc.send(id:'id attribute', file_name:'target file name')
69
+
70
+ # checkbox
71
+ doc.send(id:'id attribute', check:true) # check
72
+ doc.send(id:'id attribute', check:false) # uncheck
73
+
74
+ # examples of specifying other attributes
75
+ doc.send(name:'name attribute', value:'hello')
76
+ doc.send(class:'class attribute', value:100)
77
+ ```
78
+
79
+
80
+
40
81
 
41
82
  ## Installation
42
83
  ```sh
43
84
  $ gem install husc
44
- ```
85
+ ```
86
+
87
+
88
+ ## Contributing
89
+ Bug reports and pull requests are welcome on GitHub at [https://github.com/AjxLab/husc](https://github.com/AjxLab/husc).
data/husc.gemspec CHANGED
@@ -13,7 +13,7 @@ Gem::Specification.new do |spec|
13
13
  spec.required_ruby_version = Gem::Requirement.new(">= 2.3.0")
14
14
 
15
15
  spec.metadata["homepage_uri"] = spec.homepage
16
- spec.metadata["source_code_uri"] = "https://github.com/AjxLab/Crawler."
16
+ spec.metadata["source_code_uri"] = "https://github.com/AjxLab/husc"
17
17
 
18
18
  # Specify which files should be added to the gem when it is released.
19
19
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
data/lib/husc/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Husc
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
data/lib/husc.rb CHANGED
@@ -3,11 +3,12 @@ require 'mechanize'
3
3
  require 'nokogiri'
4
4
  require 'net/http'
5
5
  require 'kconv'
6
- require "husc/version"
6
+ require 'husc/version'
7
7
 
8
8
  module Husc
9
9
  class Error < StandardError; end
10
- class Husc
10
+
11
+ class Crawler
11
12
  attr_reader :url, :html, :tables, :params
12
13
 
13
14
  # 特殊配列
@@ -28,7 +29,7 @@ module Husc
28
29
 
29
30
  def method_missing(method, *args)
30
31
  if self == []
31
- return eval("Husc.new(doc: nil).#{method}(*#{args})")
32
+ return eval("Crawler.new(doc: nil).#{method}(*#{args})")
32
33
  end
33
34
 
34
35
  return eval("self[0].#{method}(*#{args})")
@@ -118,7 +119,7 @@ module Husc
118
119
 
119
120
  def xpath(locator, single = false)
120
121
  ## -----*----- HTMLからXPath指定で要素取得 -----*----- ##
121
- elements = CrawlArray.new(@doc.xpath(locator).map {|el| Husc.new(doc: el)})
122
+ elements = CrawlArray.new(@doc.xpath(locator).map {|el| Crawler.new(doc: el)})
122
123
  if single
123
124
  # シングルノード
124
125
  if elements[0] == nil
@@ -134,7 +135,7 @@ module Husc
134
135
 
135
136
  def css(locator, single = false)
136
137
  ## -----*----- HTMLからCSSセレクタで要素取得 -----*----- ##
137
- elements = CrawlArray.new(@doc.css(locator).map {|el| Husc.new(doc: el)})
138
+ elements = CrawlArray.new(@doc.css(locator).map {|el| Crawler.new(doc: el)})
138
139
  if single
139
140
  # シングルノード
140
141
  if elements[0] == nil
@@ -211,3 +212,4 @@ module Husc
211
212
  end
212
213
  end
213
214
  end
215
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: husc
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tatsuya Abe
@@ -1077,7 +1077,7 @@ licenses:
1077
1077
  - MIT
1078
1078
  metadata:
1079
1079
  homepage_uri: https://github.com/AjxLab/husc
1080
- source_code_uri: https://github.com/AjxLab/Crawler.
1080
+ source_code_uri: https://github.com/AjxLab/husc
1081
1081
  post_install_message:
1082
1082
  rdoc_options: []
1083
1083
  require_paths: